From 7ece850869e9e71b6dc6d6f4ce913f00f6f2c887 Mon Sep 17 00:00:00 2001 From: Siyuan Feng Date: Sun, 22 Sep 2024 14:44:28 +0800 Subject: [PATCH 1/3] [Doc] Update Architecture Overview Update and reorganize architecture documentation This commit updates the architecture documentation by removing outdated files and reorganizing the content. It also updates related sections in the deep dive and developer tutorial. --- docs/arch/benchmark.rst | 137 ---- docs/arch/convert_layout.rst | 269 -------- docs/arch/frontend/tensorflow.rst | 254 ------- docs/arch/hybrid_script.rst | 100 --- docs/arch/index.rst | 218 ++---- docs/arch/inferbound.rst | 763 --------------------- docs/arch/microtvm_design.rst | 357 ---------- docs/arch/microtvm_project_api.rst | 150 ---- docs/arch/model_library_format.rst | 171 ----- docs/arch/relay_intro.rst | 206 ------ docs/arch/relay_op_strategy.rst | 282 -------- docs/arch/virtual_machine.rst | 410 ----------- docs/deep_dive/relax/index.rst | 2 +- docs/deep_dive/tensor_ir/index.rst | 2 +- docs/dev/tutorial/codebase_walkthrough.rst | 2 +- docs/index.rst | 2 +- 16 files changed, 79 insertions(+), 3246 deletions(-) delete mode 100644 docs/arch/benchmark.rst delete mode 100644 docs/arch/convert_layout.rst delete mode 100644 docs/arch/frontend/tensorflow.rst delete mode 100644 docs/arch/hybrid_script.rst delete mode 100644 docs/arch/inferbound.rst delete mode 100644 docs/arch/microtvm_design.rst delete mode 100644 docs/arch/microtvm_project_api.rst delete mode 100644 docs/arch/model_library_format.rst delete mode 100644 docs/arch/relay_intro.rst delete mode 100644 docs/arch/relay_op_strategy.rst delete mode 100644 docs/arch/virtual_machine.rst diff --git a/docs/arch/benchmark.rst b/docs/arch/benchmark.rst deleted file mode 100644 index 8217a4feb7df..000000000000 --- a/docs/arch/benchmark.rst +++ /dev/null @@ -1,137 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -******************************** -Benchmark Performance Log Format -******************************** -This page details schema v0.1 for a unified benchmark log format. This schema will allow easier cross-references with other frameworks/runs, experiment reproduction, data for nightly perf regression, and the separation of logging/visualization efforts. - -Log Format Overview -~~~~~~~~~~~~~~~~~~~ - -For simplicity, we suggest prioritizing the fields `workload`, `engine`, `hardware` `runtime_ms_mean`, and `runtime_ms_std`. For finer-grained logging, one may additionally propagate the `*_config` fields. 
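For example, a minimal record that carries only these prioritized fields might look like the following (the values here are illustrative placeholders drawn from the examples column below, not measurements from a real run):

::

    {
        "workload": "resnet-18",
        "engine": "tvm",
        "hardware": "gcp-c2-standard-16",
        "runtime_ms_mean": 12.452,
        "runtime_ms_std": 5.3
    }
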
- -+-----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+------------------------------------------------------------------------------+ -| header | examples | category | notes/justification | -+=======================+==============================================================================================================================================================================+==============+==============================================================================+ -| workload | resnet-18 | workload | name of workload | -+-----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+------------------------------------------------------------------------------+ -| engine | "tvm" / "onnxruntime" | compiler | | -+-----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+------------------------------------------------------------------------------+ -| hardware | "gcp-c2-standard-16" | hardware | descriptor of target hardware environment | -+-----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+------------------------------------------------------------------------------+ -| runtime_ms_mean | 12.452 | statistics | | -+-----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+------------------------------------------------------------------------------+ -| runtime_ms_std | 5.3 | statistics | | -+-----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+------------------------------------------------------------------------------+ -| timestamp | 1572282699.6 | metadata | indicates when this record is logged | -+-----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+------------------------------------------------------------------------------+ -| schema\_version | "0.1" | metadata | ensure reproducibility as we iterate on this schema | -+-----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+------------------------------------------------------------------------------+ -| metadata | { "docker\_tag":"gcr.io/.../0a680", ... 
} | metadata | ``docker_tag`` is optional | -+-----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+------------------------------------------------------------------------------+ -| workload\_args | {“input\_name”: "Input3", “input\_shape”: [list\_of\_shape], “data\_layout”: NHCW} | workload | | -+-----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+------------------------------------------------------------------------------+ -| workload\_metadata | {"class": "vision","doc\_url": "``https://github.com/.../README.md``", "opset": 7,"type": "body\_analysis","url": "``https://onnxzoo...ferplus.tar.gz``", "md5": "07fc7..."} | workload | source of workload | -+-----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+------------------------------------------------------------------------------+ -| engine\_version | "1.0.5" | compiler | use semvar format | -+-----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+------------------------------------------------------------------------------+ -| engine\_config | {“llvm”: “llvm-8”, “nvcc”: 10.1, "accelerator": "MLAS", "relay_opt_level": 3, "tvm_target":"llvm -mcpu=cascadelake"} | compiler | fields are optionally specified | -+-----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+------------------------------------------------------------------------------+ -| compilation\_config | {"opt_level": 3, "layer_schedules":[]/ } | compiler | fields are optionally specified | -+-----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+------------------------------------------------------------------------------+ -| software\_config | {"os": "ubuntu:18.04","pip": { "docker": "4.1.0", "gitpython": "3.0.4", "numpy": "1.17.4", "onnx": "1.6.0"}, “cudnn”: “cudnn-8”, "cuda_driver”: “480.10.1”} | backend | env dependency list | -+-----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+------------------------------------------------------------------------------+ -| runtime\_config | {"num_cpu_threads": 3} | backend | info on non-hardware, non-software metadata | -+-----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+------------------------------------------------------------------------------+ -| hardware\_config | {"cpu_count": 16, "cloud_machine_type":"c2-standard-16", "memory_GB":64} | hardware | 
json descriptor of target hardware environment | -+-----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+------------------------------------------------------------------------------+ -| execution\_config | {“number”: 1, “repeat”: 10, “min\_repeat\_ms”, 0} | statistics | workload execution parameters | -+-----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+------------------------------------------------------------------------------+ -| metrics | {“accuracy”: 48.5,“compilation_ms_mean”: 12} | statistics | other metrics | -+-----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+------------------------------------------------------------------------------+ -| runtime_raw | [{"runtime_ms": 12, ...}, {"runtime_ms":13,...},...] | statistics | optional raw metrics array | -+-----------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------+------------------------------------------------------------------------------+ - - - -Storage format -~~~~~~~~~~~~~~ -Currently we're prototyping benchmark data as JSON objects for extensibility and convenience, especially in early versions of the schema. However, as we scale up benchmark aggregation and stabilize parameters, we anticipate switching to a columnar format, such as Arrow or Parquet. - -Here is sample data encoded as JSON: - -:: - - { - "workload":"arcface_resnet100", - "engine":"tvm", - "hardware":"gcp-c2-standard-16", - "runtime_ms_mean":109.43004820081924, - "runtime_ms_std":0.09078385126800587, - "timestamp":"20191123003411", - "schema_version":"0.1", - "metadata":{ - "docker_tag":"tlcpack/ci-gpu:v0.53" - }, - "workload_args":{ - "input_shape_dict":{ - "data":[ - 1, - 3, - 112, - 112 - ] - }, - "input_type_dict":{ - "data":"float32" - }, - "input_value_dict":{} - }, - "workload_metadata":{ - "class":"vision", - "doc_url":"https://github.com/onnx/models/blob/main/vision/body_analysis/arcface/README.md", - "md5":"66074b860f905295aab5a842be57f37d", - "opset":8, - "type":"body_analysis", - "url":"https://s3.amazonaws.com/onnx-model-zoo/arcface/resnet100/resnet100.tar.gz" - }, - "engine_version":"1.0.0", - "engine_config":{}, - "compilation_config":{ - "relay_opt_level": 3 - }, - "software_config":{ - "os":"ubuntu:18.04", - "pip":{ - "docker":"4.1.0", - "gitpython":"3.0.4", - "numpy":"1.17.4", - "onnx":"1.6.0" - } - }, - "runtime_config":{}, - "hardware_config":{ - "cloud_machine_type":"c2-standard-16", - "cloud_provider":"GCP", - "cpu_count":16, - "cpu_platform":"Intel Cascade Lake", - "memory_GB":64 - }, - "execution_config":{}, - "metrics":{} - } diff --git a/docs/arch/convert_layout.rst b/docs/arch/convert_layout.rst deleted file mode 100644 index 51917fce44df..000000000000 --- a/docs/arch/convert_layout.rst +++ /dev/null @@ -1,269 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. 
See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at -.. http://www.apache.org/licenses/LICENSE-2.0 -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -=================== -Convert Layout Pass -=================== -**Author**: `Animesh Jain `_ - -************* -1. Background -************* - -Data layout format describes how the data is laid out in the memory. For example, Tensorflow framework default data layout for convolution operator is NHWC, i.e, the data is 4-dimensions and is laid out in row-major format with N being the first dimension and C being the last dimension. Data layout has a major role in model performance, significantly affecting spatial and temporal locality. For example, Intel x86 backend in TVM prefers layout as NCHWc where the C dimension is tiled in 2 dimensions to exploit data locality efficiently. Similarly, CUDA backend prefers the data layout to be in NCHW format. - -Essentially, TVM has to deal with data layouts throughout the compiler toolchain - Framework parsers, Relay layout transformations, and TOPI schedules. As we move towards third-party codegen integration, which might have their own data layout restrictions, handling layouts at all levels in TVM toolchain is going to become even more challenging. Therefore, we developed a new Relay pass - **ConvertLayout** -- to reduce some of the complications that arise due to layout handling. - -If you directly want to understand the usage of ConvertLayout Pass, directly jump to Section 4 - Usage. - -************************** -2. Motivation and Overview -************************** - -Let's look at a simple scenario to understand the complications that arise due to different layouts - Suppose we want to compile a Tensorflow NHWC graph for an ARM edge device. But, suppose we currently support only NCHW schedules in TOPI for ARM. So, there is a mismatch between framework layout and TOPI-supported layout. One way to deal with this mismatch is to insert layout transforms before each and after convolution, such that resulting convolution has NCHW input data layout and can use TOPI schedules. However, this can lead to performance degradation because of the presence of too many layout transforms. - -We encountered similar problems in other use cases as well - -- No way to run TFLite graphs on Nvidia GPUs. TOPI has NCHW-only schedules for GPUs. -- Ever-complicating logic in AlterOpLayout for convolution to support different pairs of layout transformations. -- Sub-optimal performance for TF graphs due to extra layout transforms. -- Complication in third-party codegen integrations like TensorRT that prefers data layout to be in one format. - -To solve these problems, we introduced *ConvertLayout* pass that sets up the infrastructure to change the data layout of the whole graph with minimal number of data layout transforms. In ideal cases, we will have only 2 layout transforms for data, one at the start and one at the end. An example to show the transformation is below - - -.. 
code-block:: python - - # Original graph - 2 convolutions in NHWC format. - fn (%x: Tensor[(1, 56, 56, 64), float32], %weight1: Tensor[(3, 3, 64, 32), float32], %weight2: Tensor[(3, 3, 32, 32), float32]) { - %0 = nn.conv2d(%x, %weight1, padding=[1, 1], channels=32, kernel_size=[3, 3], data_layout="NHWC", kernel_layout="HWIO"); - %1 = nn.relu(%0); - %2 = nn.conv2d(%1, %weight2, padding=[1, 1], channels=32, kernel_size=[3, 3], data_layout="NHWC", kernel_layout="HWIO"); - nn.relu(%2) - } - - # After ConvertLayout - For data, there is a transform at the start and at the end. - # For weights, there are transforms to adapt to NCHW layout. These will be removed by FoldConstant pass. - fn (%x: Tensor[(1, 56, 56, 64), float32], %weight1: Tensor[(3, 3, 64, 32), float32], %weight2: Tensor[(3, 3, 32, 32), float32]) { - %0 = layout_transform(%x, src_layout="NHWC", dst_layout="NCHW") /* ty=Tensor[(1, 64, 56, 56), float32] */; - %1 = layout_transform(%weight1, src_layout="HWIO", dst_layout="OIHW") /* ty=Tensor[(32, 64, 3, 3), float32] */; - %2 = nn.conv2d(%0, %1, padding=[1, 1], channels=32, kernel_size=[3, 3]) /* ty=Tensor[(1, 32, 56, 56), float32] */; - %3 = nn.relu(%2) /* ty=Tensor[(1, 32, 56, 56), float32] */; - %4 = layout_transform(%weight2, src_layout="HWIO", dst_layout="OIHW") /* ty=Tensor[(32, 32, 3, 3), float32] */; - %5 = nn.conv2d(%3, %4, padding=[1, 1], channels=32, kernel_size=[3, 3]) /* ty=Tensor[(1, 32, 56, 56), float32] */; - %6 = nn.relu(%5) /* ty=Tensor[(1, 32, 56, 56), float32] */; - layout_transform(%6, src_layout="NCHW", dst_layout="NHWC") /* ty=Tensor[(1, 56, 56, 32), float32] */ - } - - -********* -3. Design -********* - -Before delving into ConvertLayout pass, let's categorize the operators into 3 categories based on their sensitivity to data layouts. This categorization will be useful later to understand Convertlayout pass details. - -- **Layout agnostic** - Relu, pow etc. These operators are not affected, neither functionality nor performance, by data layouts. -- **Lightly-layout sensitive** - pad, concatenate, reduce ops like sum etc. These operators have some attributes that are functionally affected if we do a layout transformation before them. However, performance-wise, the difference is not significant. For these operators, it is beneficial to just adapt to the previous operator output data layout. -- **Heavily-layout sensitive** - Convolution, conv2d_transpose etc. These operators are heavily affected, both functionally and performance-wise, by data layouts. They also have data layout as the op attribute. Typically, it is beneficial to modify the input data layouts for these operators (if its not a performant data layout), while the rest of *layout agnostic* and *lightly-layout sensitive* operators adapt to the layout governed by the output of these *heavliy-layout sensitive* operators. - - -Let us now look at two relevant Relay operator properties. Each relay operator has properties, like InferType, that can be defined by a TVM developer. Typically, a Relay pass traverses the graph operator-by-operator and reads these operator properties. For example, InferType pass looks at the InferType property of on operator, determines its output shape and type, and then passes it to the next operator InferType property. Similarly, in our context, we have 2 such properties - *FTVMConvertLayout* and *FInferCorrectLayout*. ConvertLayout pass traverses the graph and looks at these 2 properties along with an automatic layout transform insertion module to handle data layouts. 
So, the whole process can be broken down into 3 steps: - -- Run FTVMConvertLayout property - This allows the developers to transform the original Relay expr into a new Relay expr with new layouts, allowing user-defined layout alteration. There is a python callback for developer's ease. This is used only for heavily-layout sensitive operators. -- Run FTVMInferCorretLayout property - We can view this as layout inference. It looks at the original input layout and the new input layouts, which are either coming from previous operator or from the FTVMConvertLayout modified expr (if it was used). This can be used by lightly-layout sensitive operators to adapt its attributes to new data layouts. Layout inference happens for each operator. -- Automatic insertion of layout transforms - The previous step - layout inference - sets the new layout for the input exprs. If these layouts are different from the original layouts, then this component automatically inserts a layout transform. Therefore, a developer does not need to do anything for this component. - -These steps happen for each operator in sequence, where ConvertLayout pass keeps on passing the new layouts to the next operator properties, finally resulting in modifying the whole graph operator-by-operator. Now, let's look at a couple of examples of how to define the two properties. - -**FTVMConvertLayout - Python callback for layout alteration** - This is used for *heavily-layout sensitive* operators. For example, one can return a new convolution operator with new data and kernel layout. The other 2 components will infer layout and insert layout transforms if needed. One example for convolution operator is as follows where we are converting to NCHW layout. - -.. code-block:: python - - @reg.register_convert_op_layout("nn.conv2d") - def convert_conv2d(attrs, inputs, tinfos, desired_layouts): - """Convert Layout pass registration for conv2d op. - - Parameters - ---------- - attrs : tvm.attrs.Attrs - Attributes of current convolution - inputs : list of tvm.relay.Expr - The args of the Relay expr to be legalized - tinfos : list of types - List of input and output types - desired_layouts : list of layout strings - List of layouts defining our desired - layout for the data and kernel inputs respectively. - - Returns - ------- - result : tvm.relay.Expr - The transformed expr - """ - - from tvm import relay - data, weight = inputs - new_attrs = dict(attrs) - - # We expect 2 desired layouts to be specified, one for the data and one for the kernel. - assert len(desired_layouts) == 2, "A desired layout is expected for both of nn.conv2d's inputs" - - # Use the first entry in desired layouts which specifies the data layout. - # The expected ordering of layouts for this operator is defined by this function. - desired_data_layout, desired_kernel_layout = map(str, desired_layouts) - - assert desired_data_layout != "default", "Data layout cannot be default" - - new_attrs['data_layout'] = desired_data_layout - - if desired_data_layout == 'NCHW': - if desired_kernel_layout != 'default': - new_attrs['kernel_layout'] = desired_kernel_layout - else: - new_attrs['kernel_layout'] = 'OIHW' - # Actual insertion of layout transforms is taken care internally - # by ConvertLayout pass. - return relay.nn.conv2d(data, weight, **new_attrs) - - raise ValueError('Layout %s is not yet supported' % desired_data_layout) - - -**FInferCorrectLayout - Layout inference** - Currently, this attribute is exposed only in C++. 
This function takes original input layouts and the new input layouts (passed from the previous operator or from the python callback for layout alteration), and infers the final data layouts. Layout inference is called for each operator. The usage might vary for different operator categories. For layout agnostic operators, we just want to return the new data layouts in this function. For lightly-layout and heavily-layout sensitive operators, we can change the operator attributes (like axis for concatenate, pad_width for pad) so that we can adapt to the new data layout, preventing insertion of layout transforms. Let's look at a couple of examples to understand this better. - -First example is for layout agnostic operators. These operators do not have any operator attributes that are affected by data layouts, so we just adapt to new layouts. - -.. code-block:: c++ - - // For operator set its attributes like following - // .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout); - - // Take arbitrary input layouts and copy to outputs. - inline Array> ElemwiseArbitraryLayout(const Attrs& attrs, - const Array& new_in_layouts, - const Array& old_in_layouts, - const Array> &old_in_shapes) { - Layout ret; - - if (new_in_layouts.defined()) { - ICHECK_GE(new_in_layouts.size(), 1); - ret = new_in_layouts[0]; - } else { - for (size_t i = 0; i < old_in_layouts.size(); ++i) { - if (old_in_layouts[i].defined()) { - ret = old_in_layouts[i]; - break; - } - } - } - - return Array>{Array(old_in_layouts.size(), ret), {ret}}; - } - - -Second example is for a lightly-layout sensitive operator - batch normalization. BatchNorm has an axis operator that has to change when we go from NHWC to NCHW data layout. (Similar handling also needs to be for heavily-layout sensitive operators) - - -.. code-block:: c++ - - Array> BatchNormInferCorrectLayout(const Attrs& attrs, - const Array& new_in_layouts, - const Array& old_in_layouts, - const Array>& old_in_shapes) { - BatchNormAttrs* param = const_cast(attrs.as()); - - size_t axis = - param->axis < 0 ? param->axis + old_in_shapes[0].size() : static_cast(param->axis); - - Layout ret = Layout::Undef(); - - // For example, consider old_layout = NHWC, and new_layout = NCHW, and param->axis = 3 - - if (new_in_layouts.defined() && old_in_layouts.defined()) { - // Get the new C axis. Extract the dim in old layout. Find the index of that dim in next layout. - - // Following line gives bn_dim = C as old_layout = NHWC, axis = 3 - const auto& bn_dim = old_in_layouts[0][axis]; - - // The new_index is 1 because new_layout = NCHW and bn_dim is C - auto new_index = new_in_layouts[0].IndexOf(bn_dim); - - // We modify the layout-dependent attribute here - axis to 1. - param->axis = new_index; - - // Finally, we adapt to the new layout. - ret = new_in_layouts[0]; - - } else if (old_in_layouts.defined()) { - ret = old_in_layouts[0]; - } - - // In case both new and old layouts are undefined, then there is no need of a change. - // ConvertLayout pass skips the automatic insertion of layout transforms in this case. - - // Following line is not important to tutorial. But, layout inference needs to define - // the layout for all input and output data layouts. For batch norm, the other inputs - // and outputs are vector having length of C dim in the input. So, we set the other - // layouts as C. BN has 5 inputs, 3 outputs. The last 4 inputs and last 2 outputs - // have "C" layout. 
- Layout c_layout = Layout("C"); - - return Array>{{ret, c_layout, c_layout, c_layout, c_layout}, - {ret, c_layout, c_layout}}; - } - - -******** -4. Usage -******** -.. _convert-layout-usage: - -ConvertLayout pass is extremely easy to use. The pass is not a part of default relay.build pipeline. The intended usage is to call it between the framework-to-relay parser and relay.build module call. - -In order to specify the layouts to convert to, we create a mapping of heavily-layout sensitive operators to a list of the desired layouts for that operator. The first example below specifies data layout, we allow the kernel layout to be automatically converted to one that is supported by TVM (for that particular data layout and operator). This is specified by the use of the "default" keyword. The second example shows how we could have also converted to a specific kernel layout of our choosing. It's worth noting that the following examples will convert to the same layouts i.e. `{'nn.conv2d': ['NCHW', 'default']} == {'nn.conv2d': ['NCHW', 'OIHW']}` - -.. code-block:: python - - # TFlite framework to Relay parser - Default layout is NHWC - mod, params = relay.frontend.from_tflite(tflite_model, - shape_dict=shape_dict, - dtype_dict=dtype_dict) - - # We assume our model's heavily-layout sensitive operators only consist of nn.conv2d - desired_layouts = {'nn.conv2d': ['NCHW', 'default']} - - # Convert the layout to NCHW - # RemoveUnunsedFunctions is used to clean up the graph. - seq = tvm.transform.Sequential([relay.transform.RemoveUnusedFunctions(), - relay.transform.ConvertLayout(desired_layouts)]) - with tvm.transform.PassContext(opt_level=3): - mod = seq(mod) - - # Call relay compilation - with relay.build_config(opt_level=3): - graph, lib, params = relay.build(mod, target, params=params) - - -.. code-block:: python - - desired_layouts = {'nn.conv2d': ['NCHW', 'OIHW']} - pass = relay.transform.ConvertLayout(desired_layouts) - - -The ordering of the layouts is defined by the implementation of `register_convert_op_layout("OPNAME")`, you can refer to the docstring which should explicitly state the expected layout. In the examples above it's [data_layout, kernel_layout]. - -Current implementation has support for almost all the operators commonly used in image classification models. However, if one encounters too many data layout transforms in the graph, it is highly likely that there is an operator whose layouts need special handling as described in Section 3. Some pull requests that can help in such a situation are - -- Layout inference for `Batch Norm `_ - Batch normalization falls into the category of lightly-sensitive operator. The PR shows how to handle the layout inference for batch norm. -- Python Callback for `Convolution `_- For highly-sensitive operators, one might have to do python callback as well. The PR shows how to define a python callback function for Convolution operator. diff --git a/docs/arch/frontend/tensorflow.rst b/docs/arch/frontend/tensorflow.rst deleted file mode 100644 index dde7179d90db..000000000000 --- a/docs/arch/frontend/tensorflow.rst +++ /dev/null @@ -1,254 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. 
You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -TensorFlow Frontend -=================== - -The TensorFlow frontend helps in importing TensorFlow models into TVM. - -Supported versions: - -- 1.12 and below - -Tested models: - -- Inception (V1/V2/V3/V4) -- Resnet (All) -- Mobilenet (V1/V2 All) -- Vgg (16/19) -- BERT (Base/3-layer) - -Preparing a Model for Inference -------------------------------- - -Remove Unneeded Nodes -~~~~~~~~~~~~~~~~~~~~~ - -The export process will remove many nodes that are not needed for inference, but unfortunately will leave some remaining. The nodes that should be manually removed are: - -- Dropout, including `Dropout`_ and `DropoutWrapper`_ -- `Assert`_ - -.. _Dropout: https://www.tensorflow.org/api_docs/python/tf/nn/dropout -.. _DropoutWrapper: https://www.tensorflow.org/versions/r1.12/api_docs/python/tf/nn/rnn_cell/DropoutWrapper?hl=hr -.. _Assert: https://www.tensorflow.org/api_docs/python/tf/debugging/Assert - -Convert None Dimensions to Constants -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -TVM has minimal support for dynamic tensor shapes. Dimensions that are ``None`` should be replaced with constants. For example, a model may accept an input with shape ``(None,20)``. This should be converted to a shape like ``(1,20)``. The model should be modified accordingly to ensure that these shapes match throughout the graph. - -Export -~~~~~~ - -TensorFlow frontend expects a frozen protobuf (.pb) or saved model as input. It currently does not support checkpoint (.ckpt). The graphdef needed by the TensorFlow frontend can be extracted from the active session, or by using the `TFParser`_ helper class. - -.. _TFParser: https://github.com/apache/tvm/blob/main/python/tvm/relay/frontend/tensorflow_parser.py - -The model should be exported with a number of transformations to prepare the model for inference. It is also important to set ```add_shapes=True```, as this will embed the output shapes of each node into the graph. Here is one function to export a model as a protobuf given a session: - -.. 
code:: python - - import tensorflow as tf - from tensorflow.tools.graph_transforms import TransformGraph - - def export_pb(session): - with tf.gfile.GFile("myexportedmodel.pb", "wb") as f: - inputs = ["myinput1", "myinput2"] # replace with your input names - outputs = ["myoutput1"] # replace with your output names - graph_def = session.graph.as_graph_def(add_shapes=True) - graph_def = tf.graph.util.convert_variables_to_constants(session, graph_def, outputs) - graph_def = TransformGraph( - graph_def, - inputs, - outputs, - [ - "remove_nodes(op=Identity, op=CheckNumerics, op=StopGradient)", - "sort_by_execution_order", # sort by execution order after each transform to ensure correct node ordering - "remove_attribute(attribute_name=_XlaSeparateCompiledGradients)", - "remove_attribute(attribute_name=_XlaCompile)", - "remove_attribute(attribute_name=_XlaScope)", - "sort_by_execution_order", - "remove_device", - "sort_by_execution_order", - "fold_batch_norms", - "sort_by_execution_order", - "fold_old_batch_norms", - "sort_by_execution_order" - ] - ) - f.write(graph_def.SerializeToString()) - -Another method is to `export and freeze the graph `_. - -Import the Model ----------------- - -Explicit Shape: -~~~~~~~~~~~~~~~ - -To ensure shapes can be known throughout the entire graph, pass the ```shape``` argument to ```from_tensorflow```. This dictionary maps input names to input shapes. Please refer to these `test cases `_ for examples. - -Data Layout -~~~~~~~~~~~ - -Most TensorFlow models are released with NHWC layout. NCHW layout often provides better performance, especially on GPU. The TensorFlow frontend can automatically convert the model's data layout by passing the argument ```layout='NCHW'``` to ```from_tensorflow```. - -Best Practices --------------- - -- Use static tensor shapes instead of dynamic shapes (remove ```None``` dimensions). -- Use static RNN instead of dynamic RNN, as ```TensorArray``` isn't supported yet. 
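Putting the export and import steps above together, a minimal end-to-end import might look like the sketch below. The file name, input name, and input shape are the same placeholders used in the export example above and should be replaced with the values for your model:

.. code:: python

    import tensorflow as tf
    from tvm import relay

    # Load the frozen graph produced by the export step above.
    with tf.gfile.GFile("myexportedmodel.pb", "rb") as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())

    # Map every input placeholder to a static shape (no None dimensions).
    shape_dict = {"myinput1": (1, 224, 224, 3)}

    # Convert to Relay; layout="NCHW" asks the frontend to convert the
    # model's data layout from the TensorFlow-default NHWC.
    mod, params = relay.frontend.from_tensorflow(graph_def,
                                                 layout="NCHW",
                                                 shape=shape_dict)
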
- -Supported Ops -------------- - -- Abs -- Add -- AddN -- All -- Any -- ArgMax -- ArgMin -- AvgPool -- BatchMatMul -- BatchMatMulV2 -- BatchNormWithGlobalNormalization -- BatchToSpaceND -- BiasAdd -- BroadcastTo -- Cast -- Ceil -- CheckNumerics -- ClipByValue -- Concat -- ConcatV2 -- Conv2D -- Cos -- Tan -- CropAndResize -- DecodeJpeg -- DepthwiseConv2dNative -- DepthToSpace -- Dilation2D -- Equal -- Elu -- Enter -- Erf -- Exit -- Exp -- ExpandDims -- Fill -- Floor -- FloorDiv -- FloorMod -- FusedBatchNorm -- FusedBatchNormV2 -- Gather -- GatherNd -- GatherV2 -- Greater -- GreaterEqual -- Identity -- IsFinite -- IsInf -- IsNan -- LeakyRelu -- LeftShift -- Less -- LessEqual -- Log -- Log1p -- LoopCond -- LogicalAnd -- LogicalOr -- LogicalNot -- LogSoftmax -- LRN -- LSTMBlockCell -- MatMul -- Max -- MaxPool -- Maximum -- Mean -- Merge -- Min -- Minimum -- MirrorPad -- Mod -- Mul -- Neg -- NextIteration -- NotEqual -- OneHot -- Pack -- Pad -- PadV2 -- Pow -- Prod -- Range -- Rank -- RealDiv -- Relu -- Relu6 -- Reshape -- ResizeBilinear -- ResizeBicubic -- ResizeNearestNeighbor -- ReverseV2 -- RightShift -- Round -- Rsqrt -- Select -- Selu -- Shape -- Sigmoid -- Sign -- Sin -- Size -- Slice -- Softmax -- Softplus -- SpaceToBatchND -- SpaceToDepth, -- Split -- SplitV -- Sqrt -- Square -- SquareDifference -- Squeeze -- StridedSlice -- Sub -- Sum -- Switch -- Tanh -- TensorArrayV3 -- TensorArrayScatterV3 -- TensorArrayGatherV3 -- TensorArraySizeV3 -- TensorArrayWriteV3 -- TensorArrayReadV3 -- TensorArraySplitV3 -- TensorArrayConcatV3 -- Tile -- TopKV2 -- Transpose -- TruncateMod -- Unpack -- UnravelIndex -- Where -- ZerosLike diff --git a/docs/arch/hybrid_script.rst b/docs/arch/hybrid_script.rst deleted file mode 100644 index a4fce342f728..000000000000 --- a/docs/arch/hybrid_script.rst +++ /dev/null @@ -1,100 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -Hybrid Frontend Developer Guide -=============================== - -If you are a developer: - -1. who is trying writing some preliminary patterns that have not been supported by TVM yet, -maybe :ref:`hybrid-langref-label` is a better place for you. - -2. who wants to know the implementation details of this module, you are right here! - -Features --------- - -Software Emulation -~~~~~~~~~~~~~~~~~~ - -In software emulation, the most interesting thing is the decorator ``tvm.te.hybrid.script``. -This decorator helps 2 things: - -1. Importing runtime variables - -2. Overloading the function according to the arguments passed - -Correct me if I am wrong: I believe that how 1. is implemented is dangerous, but I have no -choice. What I did is to add those names into python dict ``func.__global__`` and after -the call to ``func`` is done, those names will be cleaned up. 
- -Overload is simple: the decorator checks the arguments' types and determines which function -should be actually called. - - -Backend Compilation -~~~~~~~~~~~~~~~~~~~ - -Compilation is a large module, you can see ``python/tvm/te/hybrid/`` for more -details. The first stage determines the usage, or more accurately the -declaration of each variable and the second stage does the actual IR -generation. - -Attributes -~~~~~~~~~~ - -So far, ONLY tensors' `shape` attribute is supported. You can see ``visit_Subscript`` -in ``python/tvm/te/hybrid/parser.py`` for more details. This is a hacky solution, I just -check the attributes when subscript. - -Loops -~~~~~ - -In HalideIR, loops have in total 4 types: ``serial``, ``unrolled``, ``parallel``, and ``vectorized``. - - -.. note:: - - Unlike what that is in HalideIR, in ``loop_type(a, b)``, ``a`` is the starting point and ``b`` - is the trip count of iterations. Here ``loop_type(a, b)`` indicates ``[a, b)``. Thus, when lowering it - to HalideIR, we need to do ``start, extent = a, b - a`` - - -.. note:: - - In HalideIR those are enums, they are in passive form. - Here we use active form to annotate loops, because they are ready to run. - - -Variables -~~~~~~~~~ - -Because there is no variables in ``HalideIR``, all the mutable variables will be lowered to an array with size 1. -It takes the first store of a variable as its declaration. - -Math Intrinsics -~~~~~~~~~~~~~~~ -So far, these math intrinsics, ``log``, ``exp``, ``sigmoid``, ``tanh``, ``power``, and ``popcount``, are supported. -Math intrinsics will be imported by the decorator. Most of the intrinsics are borrowed by library implementation -except ``popcount`` and ``sigmoid``. I implemented them manually. - - -Casting -~~~~~~~ - -You can cast values by using the keywords ``uint8``, ``uint16`` ``uint32``, ``uint64``, ``int8``, ``int16``, ``int32``, ``int64``, -``float16``, ``float32``, ``float64``. diff --git a/docs/arch/index.rst b/docs/arch/index.rst index 17884a774253..cf4829268ee2 100644 --- a/docs/arch/index.rst +++ b/docs/arch/index.rst @@ -18,46 +18,37 @@ Design and Architecture ======================= -This document is intended for developers who want to understand the -architecture of TVM and/or actively develop on the project. +This document is intended for developers who want to understand the architecture of Apache TVM and/or actively develop on the project. This page is organized as follows: -- The `Example Compilation Flow`_ gives an overview of the steps that TVM takes to turn a high level description of a model into a deployable module. +- The `Overall Flow`_ gives an overview of the steps that TVM takes to turn a high level description of a model into a deployable module. To get started, please read this section first. - -- The `Logical Architecture Components`_ section describes the logical components. - The sections after are specific guides focused on each logical component, organized - by the component's name. - -- The :ref:`Device/Target Interactions ` - page describes how TVM interacts with each supported physical device - and code-generation target. - -- Feel free to also check out the :ref:`dev-how-to` for useful development tips. +- Brief introduction to the key components of the TVM stack. Feel free to also check out the :ref:`TensorIR Deep Dive ` + and :ref:`Relax Deep Dive ` for more details about the two major components in the TVM stack. This guide provides a few complementary views of the architecture. 
First, we review a single end-to-end compilation flow and discuss the key data structures and the transformations. This runtime-based view focuses on the interactions of each components when running the compiler. Then we will review the logical modules of the codebase and their relationship. This part provides a static overarching view of the design. - -Example Compilation Flow ------------------------- +Overall Flow +------------ In this guide, we will study an example compilation flow in the compiler. The figure below shows the flow. At a high-level, it contains several steps: -- Import: The frontend component ingests a model into an IRModule, which contains a collection of functions that internally represent the model. -- Transformation: The compiler transforms an IRModule to another functionally equivalent or approximately +- **Model Creation**: Create the IRModule to be optimized and compiled, which contains a collection of functions that internally represent the model. + Users can manually construct IRModule via NNModule, TVMScript, or import a pre-trained model from from Relax frontend. +- **Transformation**: The compiler transforms an IRModule to another functionally equivalent or approximately equivalent(e.g. in the case of quantization) IRModule. Many of the transformations are target (backend) independent. We also allow target to affect the configuration of the transformation pipeline. -- Target Translation: The compiler translates(codegen) the IRModule to an executable format specified by the target. +- **Target Translation**: The compiler translates(codegen) the IRModule to an executable format specified by the target. The target translation result is encapsulated as a `runtime.Module` that can be exported, loaded, and executed on the target runtime environment. -- Runtime Execution: the user loads back a `runtime.Module` and runs the compiled functions in the supported runtime environment. +- **Runtime Execution**: the user loads back a `runtime.Module` and runs the compiled functions in the supported runtime environment. -.. figure:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/design/tvm_dyn_workflow.svg +.. figure:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/design/tvm_overall_flow.svg :align: center - :width: 85% + :width: 80% Key data structures @@ -70,13 +61,14 @@ components that either define a collection of key data structures or transformat **IRModule** is the primary data structure used across the entire stack. An IRModule (intermediate representation module) contains a collection of functions. Currently, we support two primary variants of functions. -- **relay::Function** is a high-level functional program representation. A relay.Function usually corresponds to an end-to-end model. - You can view a relay.Function as a computational graph with additional support for control-flow, recursion, and complex data structures. +- **relax::Function** is a high-level functional program representation. A relax.Function represents high-level graph structure, + usually corresponds to an end-to-end model or a sub-graph of the overall model. You can view a relax.Function as a computational + graph with additional support for control-flow, and complex data structures. - **tir::PrimFunc** is a low-level program representation that contains elements including loop-nest choices, multi-dimensional load/store, threading, and vector/tensor instructions. 
It is usually used to represent an operator program that executes a (possibly-fused) layer in a model. -During the compilation, a relay function may be lowered to multiple tir::PrimFunc functions and a top-level function that calls into -those tir::PrimFunc functions. +During the compilation and transformation, all relax operators are lowered to ``tir::PrimFunc`` or ``TVM PackedFunc``, which can be executed directly +on the target device, while the calls to relax operators are lowered to calls to low-level functions (e.g. ``R.call_tir`` or ``R.call_dps``). Transformations ~~~~~~~~~~~~~~~ @@ -86,44 +78,35 @@ Now that we have covered the key data structures, let us talk about the transfor - optimization: transform a program to an equivalent, possibly more optimized version. - lowering: transform a program to a lower-level representation that is closer to the target. -**relay/transform** contains a collection of passes that optimize the model. The optimizations include common program -optimizations such as constant folding and dead-code elimination, and tensor-computation specific passes such as layout -transformation and scaling factor folding. - -Near the end of the relay optimization pipeline, we will run a pass(FuseOps) to break the end-to-end function(e.g. MobileNet) -into sub-function(e.g. conv2d-relu) segments. We call these segments of functions. -This process helps us to divide the original problem into two sub-problems: - -- Compilation and optimization for each sub-function. -- Overall execution structure: we need to do a sequence of calls into the generated sub-functions to execute the whole model. - -We use the low-level tir phase to compile and optimize each sub-functions. For specific targets, we may also directly go to the target translation -phase and use external code generators. - -There are a few different ways(in relay/backend) to handle the calls into the overall execution problem. For simple models with known shapes and no control flow, we can lower to a graph executor that stores the execution structure in a graph. We also support a virtual machine backend for dynamic executions. Finally, we plan to support ahead of time compilation that compiles the high-level execution structure into the executable and generated primitive functions. All of these execution modes are encapsulated by a unified **runtime.Module** interface, which we will discuss in the latter part of the guide. +relax transformations +^^^^^^^^^^^^^^^^^^^^^ +relax transformations contain a collection of passes that apply to relax functions. The optimizations include common graph-level +optimizations such as constant folding and dead-code elimination for operators, and backend-specific optimizations such as library dispatch. -**tir/transform** contains transformation passes for TIR level functions. Many tir passes serve the purpose of lowering. For example, there are passes to flatten multi-dimensional access to one-dimensional pointer access, to expand the intrinsics into target-specific ones, and to decorate the function entry to meet the runtime calling convention. Of course, there are also optimizations passes, such as access index simplification and dead code elimination. +tir transformations +^^^^^^^^^^^^^^^^^^^ +tir transformations contain a collection of passes that apply to tir functions. There are two major types of transformations: -Many low-level optimizations can be handled in the target phase by the LLVM, CUDA C, and other target compilers. 
As a result, we leave low-level optimizations such as register allocation to the downstream compilers and only focus on optimizations that are not covered by them. +- **TensorIR schedule**: TensorIR schedules are designed to optimize the TensorIR functions for a specific target, with user-guided instructions and control how the target code is generated. + For CPU targets, TIR PrimFunc can generate valid code and execute on the target device without schedule but with very-low performance. However, for GPU targets, the schedule is essential + for generating valid code with thread bindings. For more details, please refer to the :ref:`TensorIR Transformation ` section. Additionally, we provides ``MetaSchedule`` to + automate the search of TensorIR schedule. +- **Lowering Passes**: These passes usually perform after the schedule is applied, transforming a TIR PrimFunc into another functionally equivalent PrimFunc, but closer to the + target-specific representation. For example, there are passes to flatten multi-dimensional access to one-dimensional pointer access, to expand the intrinsics into target-specific ones, + and to decorate the function entry to meet the runtime calling convention. -Search-space and Learning-based Transformations -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Many low-level optimizations can be handled in the target phase by the LLVM, CUDA C, and other target compilers. As a result, we leave low-level optimizations such as register allocation + to the downstream compilers and only focus on optimizations that are not covered by them. -The transformation passes we described so far are deterministic and rule-based. One design goal of the TVM stack is to support high-performance code optimizations for different hardware platforms. To do so, we will need to investigate as many optimization choices as possible, including but not limited to, multi-dimensional tensor access, loop tiling behavior, special accelerator memory hierarchy, and threading. +cross-level transformations +^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Apache TVM brings a unity strategy to optimize the end-to-end models. As the IRModule includes both relax and tir functions, the cross-level transformations are designed to mutate +the IRModule by applying different transformations to these two types of functions. -It is hard to define a heuristic to make all of the choices. Instead, we will take a search and learning-based approach. -We first define a collection of actions we can take to transform a program. Example actions include loop transformations, inlining, -vectorization. We call these actions **scheduling primitives**. The collection of scheduling primitives defines a search space of possible -optimizations we can make to a program. The system then searches over different possible scheduling -sequence to pick the best scheduling combination. -The search procedure is usually guided by a machine learning algorithm. - -We can record the best schedule sequence for an (possibly-fused) operator once the search is completed. The compiler can then just lookup the best -schedule sequence and apply it to the program. Notably, this schedule application phase is **exactly like** the rule-based transformations, -enabling us to share the same interface convention with tradition passes. - -We use search based optimizations to handle the initial tir function generation problem. This part of the module is called AutoTVM(auto_scheduler). 
-We expect to expand the learning-based transformations to more areas as we continue to develop the TVM stack. +For example, ``relax.LegalizeOps`` pass mutates the IRModule by lowering relax operators, add corresponding TIR PrimFunc into the IRModule, and replace the relax operators +with calls to the lowered TIR PrimFunc. Another example is operator fusion pipeline in relax (including ``relax.FuseOps`` and ``relax.FuseTIR``), which fuse multiple consecutive tensor operations +into one. Different from the previous implementations, relax fusion pipeline analyzes the pattern of TIR functions and detects the best fusion rules automatically rather +than human-defined operator fusion patterns. Target Translation ~~~~~~~~~~~~~~~~~~ @@ -204,19 +187,6 @@ except that the data structure of interest changes from the numpy.ndarray to tvm - Manipulate the IR directly using TVM's python API. -Logical Architecture Components -------------------------------- - -.. figure:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/design/tvm_static_overview.svg - :align: center - :width: 85% - - TVM Architecture Diagram - -The above figure shows the major logical components in the project. Please read the following sections -for information about the components and their relations. - - tvm/support ----------- The support module contains the most common utilities for the infrastructure, such as generic arena allocator, socket, and logging. @@ -243,22 +213,19 @@ These hardware-specific runtime modules define APIs for device memory allocation device and benchmark the execution performance. The rpc infrastructure enables data collection from a wide range of hardware backends for learning-based optimizations. - .. toctree:: :maxdepth: 1 runtime - .. toctree:: :maxdepth: 1 debugger - virtual_machine introduction_to_module_serialization device_target_interactions - +.. TODO(tvm-team) add a section about relax vm here tvm/node -------- @@ -275,11 +242,9 @@ Thanks to the node module, we can directly access any field of the TVM's IRNode # we can directly use the field name to access the IR structures assert y.a == x - We can also serialize arbitrary IR node into a JSON format, and load them back. The ability to save/store, and inspect an IR node provides a foundation for making the compiler more accessible. - tvm/ir ------ The `tvm/ir` folder contains the unified data structure and interfaces across for all IR function variants. @@ -331,11 +296,25 @@ in the target and builtin information registered to each target id(cuda, opencl) device_target_interactions +tvm/relax +--------- + +Relax is the high-level IR used to represent the computational graph of a model. Various optimizations are defined in ``relax.transform``. +Note that Relax usually works closely the the TensorIR IRModule, most of the transformations are applied on the both Relax and TensorIR functions +in the IRModule. Please refer to the :ref:`Relax Deep Dive ` for more details. + tvm/tir ------- TIR contains the definition of the low-level program representations. We use `tir::PrimFunc` to represent functions that can be transformed by TIR passes. -Besides the IR data structures, the tir module also defines a set of builtin intrinsics and their attributes via the common Op registry, as well as transformation passes in `tir/transform`. +Besides the IR data structures, the tir module also includes: + +- A set of schedule primitives to control the generated code in ``tir/schedule``. +- A set of builtin intrinsics in ``tir/tensor_intrin``. 
+- A set of analysis passes to analyze the TIR functions in ``tir/analysis``. +- A set of transformation passes to lower or optimize the TIR functions in ``tir/transform``. + +Please refer to the :ref:`TensorIR Deep Dive ` for more details. tvm/arith --------- @@ -344,75 +323,28 @@ This module is closely tied to the TIR. One of the key problems in the low-level arithmetic properties — the positiveness, variable bound, and the integer set that describes the iterator space. arith module provides a collection of tools that do (primarily integer) analysis. A TIR pass can use these analyses to simplify and optimize the code. -tvm/te ------- - -The name te stands for "tensor expression". This is a domain-specific language module that allows us to construct `tir::PrimFunc` variants quickly by writing tensor expressions. -Importantly, a tensor expression itself is not a self-contained function that can be stored into IRModule. Instead, it is a fragment of IR that we can stitch together to build an IRModule. +tvm/te and tvm/topi +------------------- -`te/schedule` provides a collection of scheduling primitives to control the function being generated. In the future, we might bring some of -these scheduling components to the a `tir::PrimFunc` itself. +TE stands for Tensor Expression. TE is a domain-specific language (DSL) for describing tensor computations. Importantly, a tensor expression +itself is not a self-contained function that can be stored into IRModule. We can use ``te.create_prim_func`` to convert a tensor expression to a ``tir::PrimFunc`` +and then integrate it into the IRModule. -.. toctree:: - :maxdepth: 1 - - inferbound - hybrid_script - -tvm/topi --------- While possible to construct operators directly via TIR or tensor expressions (TE) for each use case it is tedious to do so. -`topi` (Tensor operator inventory) provides a set of pre-defined operators (in TE or TIR) defined by -numpy and found in common deep learning workloads. We also provide a collection of common schedule templates to obtain performant implementations across different target platforms. - - -tvm/relay ---------- -Relay is the high-level functional IR used to represent full models. Various optimizations are defined in `relay.transform`. The Relay compiler defines multiple dialects, -and each dialect is designed to support specific styles of optimization. Notable ones include QNN(for importing pre-quantized models), VM(for lowering to dynamic virtual machine), -memory(for memory optimization). - -.. toctree:: - :maxdepth: 1 - - relay_intro - relay_op_strategy - convert_layout - - -tvm/autotvm ------------ +`topi` (Tensor operator inventory) provides a set of pre-defined operators defined by numpy and found in common deep learning workloads. -AutoTVM and AutoScheduler are both components which automate search based program optimization. This is rapidly evolving and primarily consists of: +tvm/meta_schedule +----------------- -- Cost models and feature extraction. -- A record format for storing program benchmark results for cost model construction. -- A set of search policies over program transformations. +MetaSchedule is a system for automated search-based program optimization. It is designed to be a drop-in replacement for AutoTVM and AutoScheduler, +and can be used to optimize TensorIR schedules. Note that MetaSchedule only works with static-shape workloads. -Automated program optimization is still an active research field. 
-AutoTVM and AutoScheduler are both components which automate search based program optimization. This is rapidly evolving and primarily consists of:
+tvm/meta_schedule
+-----------------
+
-- Cost models and feature extraction.
-- A record format for storing program benchmark results for cost model construction.
-- A set of search policies over program transformations.
+MetaSchedule is a system for automated search-based program optimization. It is designed to be a drop-in replacement for AutoTVM and AutoScheduler,
+and can be used to optimize TensorIR schedules. Note that MetaSchedule only works with static-shape workloads.

-Automated program optimization is still an active research field. As a result, we have attempted to modularize the design so that researchers may quickly modify a
-component or apply their own algorithms via the Python bindings, and
-customize the search and plugin their algorithms from the Python binding.
-
-.. toctree::
-   :maxdepth: 1
-
-   benchmark
-
-Frontends
---------
-Frontends ingest models from different frameworks into the TVM stack.
-:py:mod:`tvm.relay.frontend` is the namespace for model ingestion APIs.
-
-.. toctree::
-   :maxdepth: 1
-
-   frontend/tensorflow
+tvm/dlight
+----------
+
-microTVM
---------
-.. toctree::
-   :maxdepth: 1
+DLight is a set of pre-defined, easy-to-use, and performant TIR schedules. DLight aims to:
+
-   microtvm_design
-   microtvm_project_api
-   model_library_format
+- Fully support **dynamic shape workloads**.
+- Be **lightweight**: DLight schedules provide tuning-free (or few-shot tuning) schedules with reasonable performance.
+- Be **robust**: each DLight rule is designed to be robust and general-purpose, and if a rule is not applicable,
+  DLight does not raise an error and automatically switches to the next rule.
diff --git a/docs/arch/inferbound.rst b/docs/arch/inferbound.rst
deleted file mode 100644
index cc516359bdba..000000000000
--- a/docs/arch/inferbound.rst
+++ /dev/null
@@ -1,763 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-   or more contributor license agreements. See the NOTICE file
-   distributed with this work for additional information
-   regarding copyright ownership. The ASF licenses this file
-   to you under the Apache License, Version 2.0 (the
-   "License"); you may not use this file except in compliance
-   with the License. You may obtain a copy of the License at
-
-.. http://www.apache.org/licenses/LICENSE-2.0
-
-.. Unless required by applicable law or agreed to in writing,
-   software distributed under the License is distributed on an
-   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-   KIND, either express or implied. See the License for the
-   specific language governing permissions and limitations
-   under the License.
-
-.. _dev-InferBound-Pass:
-
-*******************************************
-InferBound Pass
-*******************************************
-
-
-The InferBound pass is run after normalize, and before ScheduleOps `build_module.py `_. The main job of InferBound is to create the bounds map, which specifies a Range for each IterVar in the program. These bounds are then passed to ScheduleOps, where they are used to set the extents of For loops, see `MakeLoopNest `_, and to set the sizes of allocated buffers (`BuildRealize `_), among other uses.
-
-The output of InferBound is a map from IterVar to Range:
-
-.. code:: cpp
-
-   Map InferBound(const Schedule& sch);
-
-Therefore, let's review the Range and IterVar classes:
-
-.. code:: cpp
-
-   namespace HalideIR {
-   namespace IR {
-      class RangeNode : public Node {
-       public:
-        Expr min;
-        Expr extent;
-        // remainder omitted
-      };
-   }}
-
-   namespace tvm {
-      class IterVarNode : public Node {
-       public:
-        Range dom;
-        Var var;
-        // remainder omitted
-      };
-   }
-
-Note that IterVarNode also contains a Range ``dom``. This ``dom`` may or may not have a meaningful value, depending on when the IterVar was created. For example, when ``tvm.compute`` is called, an `IterVar is created `_ for each axis and reduce axis, with dom's equal to the shape supplied in the call to ``tvm.compute``.
- -On the other hand, when ``tvm.split`` is called, `IterVars are created `_ for the inner and outer axes, but these IterVars are not given a meaningful ``dom`` value. - -In any case, the ``dom`` member of an IterVar is never modified during InferBound. However, keep in mind that the ``dom`` member of an IterVar is sometimes used as default value for the Ranges InferBound computes. - -We next review some TVM codebase concepts that are required to understand the InferBound pass. - -Recall that InferBound takes one argument, a Schedule. This schedule object, and its members, contains all information about the program being compiled. - -A TVM schedule is composed of Stages. Each stage has exactly one Operation, e.g., a ComputeOp or a TensorComputeOp. Each operation has a list of root_iter_vars, which in the case of ComputeOp, are composed of the axis IterVars and the reduce axis IterVars. Each operation can also contain many other IterVars, but all of them are related by the operations's list of IterVarRelations. Each IterVarRelation represents either a split, fuse or rebase in the schedule. For example, in the case of split, the IterVarRelation specifies the parent IterVar that was split, and the two children IterVars: inner and outer. - - -.. code:: cpp - - namespace tvm { - class ScheduleNode : public Node { - public: - Array outputs; - Array stages; - Map stage_map; - // remainder omitted - }; - - class StageNode : public Node { - public: - Operation op; - Operation origin_op; - Array all_iter_vars; - Array leaf_iter_vars; - Array relations; - // remainder omitted - }; - - class OperationNode : public Node { - public: - virtual Array root_iter_vars(); - virtual Array InputTensors(); - // remainder omitted - }; - - class ComputeOpNode : public OperationNode { - public: - Array axis; - Array reduce_axis; - Array body; - Array root_iter_vars(); - // remainder omitted - }; - } - -Tensors haven't been mentioned yet, but in the context of TVM, a Tensor represents output of an operation. - -.. code:: cpp - - class TensorNode : public Node { - public: - // The source operation, can be None - // This Tensor is output by this op - Operation op; - // The output index from the source operation - int value_index; - }; - -In the Operation class declaration above, we can see that each operation also has a list of InputTensors. Thus the stages of the schedule form a DAG, where each stage is a node in the graph. There is an edge in the graph from Stage A to Stage B, if the operation of Stage B has an input tensor whose source operation is the op of Stage A. Put simply, there is an edge from A to B, if B consumes a tensor produced by A. See the diagram below. This graph is created at the beginning of InferBound, by a call to `CreateReadGraph `_. - -.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/main/images/docs/inferbound/stage_graph.png - :align: center - -InferBound makes one pass through the graph, visiting each stage exactly once. InferBound starts from the output stages (i.e., the solid blue nodes in the graph above), and moves upwards (in the opposite direction of the edges). This is achieved by performing a reverse topological sort on the nodes of the graph. Therefore, when InferBound visits a stage, each of its consumer stages has already been visited. - -.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/main/images/docs/inferbound/inferbound_traversal.png - :align: center - -The InferBound pass is shown in the following pseudo-code: - -.. 
code:: cpp - - Map InferBound(const Schedule& sch) { - Array outputs = sch->get_outputs(); - G = CreateGraph(outputs); - stage_list = sch->reverse_topological_sort(G); - Map rmap; - for (Stage s in stage_list) { - InferRootBound(s, &rmap); - PassDownDomain(s, &rmap); - } - return rmap; - } - -The InferBound pass has two interesting properties that are not immediately obvious: - -1. After InferBound visits a stage, the ranges of all IterVars in the stage will be set in ``rmap``. -2. The Range of each IterVar is only set once in ``rmap``, and then never changed. - -So it remains to explain what InferBound does when it visits a stage. As can be seen in the pseudo-code above, InferBound calls two functions on each stage: InferRootBound, and PassDownDomain. The purpose of InferRootBound is to set the Range (in ``rmap``) of each root_iter_var of the stage. (Note: InferRootBound does not set the Range of any other IterVar, only those belonging to root_iter_vars). The purpose of PassDownDomain is to propagate this information to the rest of the stage's IterVars. When PassDownDomain returns, all IterVars of the stage have known Ranges in ``rmap``. - -The remainder of the document dives into the details of InferRootBound and PassDownDomain. Since PassDownDomain is simpler to describe, we will cover it first. - -.. _IterVarHyperGraph: - -IterVar Hyper-graph -------------------- - -The InferBound pass traverses the stage graph, as described above. However, within each stage is another graph, whose nodes are IterVars. InferRootBound and PassDownDomain perform message-passing on these IterVar graphs. - -Recall that all IterVars of the stage are related by IterVarRelations. The IterVarRelations of a stage form a directed acyclic hyper-graph, where each node of the graph corresponds to an IterVar, and each hyper-edge corresponds to an IterVarRelation. We can also represent this hyper-graph as a DAG, which is simpler to visualize as shown below. - -.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/main/images/docs/inferbound/relations.png - :align: center - - -The above diagram shows the IterVar hyper-graph for one stage. The stage has one root_iter_var, ``i``. It has been split, and the resulting inner axis ``i.inner``, has been split again. The leaf_iter_vars of the stage are shown in green: ``i.outer``, ``i.inner.outer``, and ``i.inner.inner``. - -Message passing functions are named "PassUp" or "PassDown", depending on whether messages are passed from children to their parent in the DAG ("PassUp"), or from the parent to its children ("PassDown"). For example, the large arrow on the left-hand side of the diagram above, shows that PassDownDomain sends messages from the root IterVar ``i`` to its children ``i.outer`` and ``i.inner``. - -.. _PassDownDomain: - -PassDownDomain --------------- -The purpose of PassDownDomain is to take the Ranges produced by InferRootBound for the root_iter_vars, and set the Ranges of all other IterVars in the stage. - -PassDownDomain iterates through the stage's IterVarRelations. There are three possible types of IterVarRelation: split, fuse, and rebase. The most interesting case (since it offers opportunity for improvement), is IterVarRelations representing splits. - -The Ranges of the inner and outer IterVars of the split are set based on the parent IterVar's known Range, as follows: - -.. 
code:: cpp - - rmap[split->inner] = Range::FromMinExtent(0, split->factor) - rmap[split->outer] = Range::FromMinExtent(0, DivCeil(rmap[split->parent]->extent, split->factor)) - -There is an opportunity here to tighten the bounds produced by InferBound, when ``split->factor`` does not evenly divide the parent's extent. Suppose the parent's extent is 20, and the split factor is 16. Then on the second iteration of the outer loop, the inner loop only needs to perform 4 iterations, not 16. If PassDownDomain could set the extent of ``split->inner`` to ``min(split->factor, rmap[split->parent]->extent - (split->outer * split->factor))``, then the extent of the inner variable would properly adapt, based on which iteration of the outer loop is being executed. - -For Fuse relations, the Range of the fused IterVar is set based on the known Ranges of the inner and outer IterVars, as follows: - -.. code:: cpp - - rmap[fuse->fused] = Range::FromMinExtent(0, rmap[fuse->outer]->extent * rmap[fuse->inner]->extent) - - -InferRootBound --------------- - -Recall that InferBound calls InferRootBound, followed by :ref:`PassDownDomain` on each stage in the stage graph. The purpose of InferRootBound is to set the Range of each root_iter_var of the Stage's operation. These Ranges will be propagated to the rest of the stage's IterVars using :ref:`PassDownDomain`. Note that InferRootBound does not set the Range of any other IterVar, only those belonging to the stage's root_iter_vars. - -If the stage is an output stage or placeholder, InferRootBound simply sets the root_iter_var Ranges to their default values. The default Range for a root_iter_var is taken from the ``dom`` member of the IterVar (see the IterVarNode class declaration above). - -Otherwise, InferRootBound iterates through the consumers of the stage. IntSets are created for each of the consumer's IterVars, as follows. Phase 1) IntSets are initialized for the consumer's leaf_iter_vars, and propagated to the consumer's root_iter_vars by PassUpDomain (Phase 2). These IntSets are used to create TensorDom of the input tensors of the consumer stage (Phase 3). Finally, once all of the consumers have been processed, InferRootBound calls GatherBound, to set the Ranges of the stage's root_iter_vars, based on the TensorDoms (Phase 4). - -This process can seem complicated. One reason is that a stage can have more than one consumer. Each consumer has different requirements, and these must somehow be consolidated. Similarly, the stage may output more than one tensor, and each consumer only uses a particular subset of these tensors. Furthermore, even if a consumer uses a particular tensor, it may not use all elements of the tensor. - -As mentioned above, a consumer may only require a small number of elements from each tensor. The consumers can be thought of as making requests to the stage, for certain regions of its output tensors. The job of Phases 1-3 is to establish the regions of each output tensor that are required by each consumer. - -.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/main/images/docs/inferbound/inferbound_phases.png - :align: center - -IntSets -~~~~~~~ - -During InferRootBound, Ranges are converted to IntSets, and message passing is performed over IntSets. Therefore, it is important to understand the difference between Ranges and IntSets. The name "IntSet" suggests it can represent an arbitrary set of integers, e.g., A = \{-10, 0, 10, 12, 13\}. 
This would certainly be more expressive than a Range, which only represents a set of contiguous integers, e.g., B = \{10,11,12\}. - -However, currently IntSets come in only three varieties: IntervalSets, StrideSets, and ModularSets. IntervalSets, similarly to Ranges, only represent sets of contiguous integers. A StrideSet is defined by a base IntervalSet, a list of strides, and a list of extents. However, StrideSet is unused, and ModularSet is only used by the frontend. - -Therefore, not all sets of integers can be represented by an IntSet in TVM currently. For example, set A in the example above can not be represented by an IntSet. However, in future the functionality of IntSet can be extended to handle more general kinds of integer sets, without requiring modification to users of IntSet. - -*InferBound is more complicated for schedules that contain compute_at. Therefore, we first explain InferBound for schedules that do not contain compute_at.* - -.. _Phase1: - -Phase 1: Initialize IntSets for consumer's leaf_iter_vars -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: cpp - - /* - * Input: Map rmap: contains the Range for each IterVar of the consumer stage - * Output: Map up_state: contains an IntSet for each leaf_iter_var of the consumer - */ - -In Phase 1, IntSets for each of the consumer's leaf_iter_vars are created, based on the Ranges of the leaf_iter_vars from ``rmap``. Recall that the consumer has already been visited by InferBound, so all of its IterVars have known Ranges in ``rmap``. - -There are three cases: - -- Case 1: Extent of leaf var's Range is 1. In this case, the up_state for the leaf is just a single point, equal to the Range's min. -- Case 2: *No relaxation is needed. In this case, the up_state for the leaf is just a single point, defined by the leaf var itself.* -- Case 3: Relaxation is needed. In this case, the leaf's Range is simply converted to an IntSet. - -For simplicity, we assume the schedule does not contain thread axes. In this case, Case 2 is only relevant if the schedule contains compute_at. Please refer to the section :ref:`InferBoundCA`, for further explanation. - -.. _Phase2: - -Phase 2: Propagate IntSets from consumer's leaves to consumer's roots -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: cpp - - /* - * Input: Map up_state: consumer leaf -> IntSet - * Output: Map dom_map: consumer root -> IntSet - */ - -The purpose of Phase 2 is to propagate the IntSet information from the consumer's leaf_iter_vars to the consumer's root_iter_vars. The result of Phase 2 is another map, ``dom_map``, that contains an IntSet for each of the consumer's root_iter_vars. - -Phase 2 begins by calling PassUpDomain, which visits the IterVarRelations of the consumer stage. In the case of a Split relation, PassUpDomain sets the up_state of the parent IterVar, based on the inner and outer IntSets, as follows: - -- Case 1: The Ranges of outer and inner IterVars match their ``up_state`` domains. In this case, set the parent's ``up_state`` by simply converting the parent's Range to an IntSet. -- Case 2: *Otherwise, the parent's* ``up_state`` *is defined by evaluating* ``outer*f + inner + rmap[parent]->min``, *with respect to the* ``up_state`` *of outer and inner. Here, instead of using the Split relation's factor, TVM uses* ``f = rmap[inner]->extent``. - -Case 2 is only needed if the schedule contains compute_at. Please refer to the section :ref:`InferBoundCA` below, for further explanation. 
- -After PassUpDomain has finished propagating up_state to all IterVars of the consumer, a fresh map, from root_iter_vars to IntSet, is created. If the schedule does not contain compute_at, the IntSet for root_iter_var ``iv`` is created by the following code: - -.. code:: cpp - - dom_map[iv->var.get()] = IntSet::range(up_state.at(iv).cover_range(iv->dom)); - -Note that if the schedule does not contain compute_at, Phases 1-2 are actually unnecessary. dom_map can be built directly from the known Ranges in rmap. Ranges simply need to be converted to IntSets, which involves no loss of information. - -.. _Phase3: - -Phase 3: Propagate IntSets to consumer's input tensors -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: cpp - - /* - * Input: Map dom_map: consumer root -> IntSet - * Output: Map tmap: output tensor -> vector> - */ - -Note that the consumer's input tensors are output tensors of the stage InferBound is working on. So by establishing information about the consumer's input tensors, we actually obtain information about the stage's output tensors too: the consumers require certain regions of these tensors to be computed. This information can then be propagated through the rest of the stage, eventually obtaining Ranges for the stage's root_iter_vars by the end of Phase 4. - -The output of Phase 3 is tmap, which is a map containing all of the stage's output tensors. Recall that a Tensor is multi-dimensional, with a number of different axes. For each output tensor, and each of that tensor's axes, tmap contains a list of IntSets. Each IntSet in the list is a request from a different consumer. - -Phase 3 is accomplished by calling PropBoundToInputs on the consumer. PropBoundToInputs adds IntSets to tmap's lists, for all input Tensors of the consumer. - -The exact behavior of PropBoundToInputs depends on the type of the consumer's operation: ComputeOp, TensorComputeOp, PlaceholderOp, ExternOp, etc. Consider the case of TensorComputeOp. A TensorComputeOp already has a Region for each of its Tensor inputs, defining the slice of the tensor that the operation depends on. For each input tensor i, and dimension j, a request is added to tmap, based on the corresponding dimension in the Region: - -.. code:: cpp - - for (size_t j = 0; j < t.ndim(); ++j) { - // i selects the Tensor t - tmap[i][j].push_back(EvalSet(region[j], dom_map)); - } - -.. _Phase4: - -Phase 4: Consolidate across all consumers -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: cpp - - /* - * Input: Map tmap: output tensor -> vector> - * Output: Map rmap: rmap is populated for all of the stage's root_iter_vars - */ - -Phase 4 is performed by GatherBound, whose behavior depends on the type of operation of the stage. We discuss the ComputeOp case only, but TensorComputeOp is the same. - -A ComputeOp has only a single output Tensor, whose axes correspond to the axis variables of the ComputeOp. The root_iter_vars of a ComputeOp include these axis variables, as well as the reduce_axis variables. If the root IterVar is an axis var, it corresponds to one of the axes of the output Tensor. GatherBound sets the Range of such a root IterVar to the union of all IntSets (i.e., union of all consumer requests) for the corresponding axis of the tensor. If the root IterVar is a reduce_axis, its Range is just set to its default (i.e., the ``dom`` member of IterVarNode). - -.. 
code:: cpp - - // 'output' selects the output tensor - // i is the dimension - rmap[axis[i]] = arith::Union(tmap[output][i]).cover_range(axis[i]->dom); - -.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/main/images/docs/inferbound/gatherbound.png - :align: center - - -The union of IntSets is computed by converting each IntSet to an Interval, and then taking the minimum of all minimums, and the maximum of all of these interval's maximums. - -.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/main/images/docs/inferbound/union.png - :align: center - - -This clearly results in some unnecessary computation, i.e., tensor elements will be computed that are never used. - -Unfortunately, even if we're lucky and the IntervalSet unions do not produce unnecessary computation, the fact that GatherBound considers each dimension of the tensor separately can also cause unnecessary computation. For example, in the diagram below the two consumers A and B require disjoint regions of the 2D tensor: consumer A requires T[0:2, 0:2], and consumer B requires T[2:4, 2:4]. GatherBound operates on each dimension of the tensor separately. For the first dimension of the tensor, GatherBound takes the union of intervals 0:2 and 2:4, producing 0:4 (note that no approximation was required here). Similarly for the second dimension of the tensor. Therefore, the dimension-wise union of these two requests is T[0:4, 0:4]. So GatherBound will cause all 16 elements of tensor T to be computed, even though only half of those elements will ever be used. - - -.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/main/images/docs/inferbound/gatherbound_problem.png - :align: center - -.. _InferBoundCA: - -InferBound with compute_at --------------------------- - -If the schedule contains compute_at, Phases 1-2 of InferRootBound become more complex. - -Motivation -~~~~~~~~~~ - -**Ex. 1** - -Consider the following snippet of a TVM program: - -:: - - C = tvm.compute((5, 16), lambda i, j : tvm.const(5, "int32"), name='C') - D = tvm.compute((5, 16), lambda i, j : C[i, j]*2, name='D') - -This produces the following (simplified IR): - -:: - - for i 0, 5 - for j 0, 16 - C[i, j] = 5 - for i 0, 5 - for j 0, 16 - D[i, j] = C[i, j]*2 - -It's easy to see that stage D requires all (5,16) elements of C to be computed. - -**Ex. 2** - -However, suppose C is computed at axis j of D: - -:: - - s = tvm.create_schedule(D.op) - s[C].compute_at(s[D], D.op.axis[1]) - -Then only a single element of C is needed at a time: - -:: - - for i 0, 5 - for j 0, 16 - C[0] = 5 - D[i, j] = C[0]*2 - -**Ex. 3** - -Similarly, if C is computed at axis i of D, only a vector of 16 elements of C are needed at a time: - -:: - - for i 0, 5 - for j 0, 16 - C[j] = 5 - for j 0, 16 - D[i, j] = C[j]*2 - -Based on the above examples, it is clear that InferBound should give different answers for stage C depending on where in its consumer D it is "attached". - -.. _AttachPaths: - -Attach Paths -~~~~~~~~~~~~ - -If stage C is computed at axis j of stage D, we say that C is *attached* to axis j of stage D. This is reflected in the Stage object by setting the following three member variables: - -.. 
code:: cpp - - class StageNode : public Node { - public: - // omitted - - // For compute_at, attach_type = kScope - AttachType attach_type; - - // For compute_at, this is the axis - // passed to compute_at, e.g., D.op.axis[1] - IterVar attach_ivar; - - // The stage passed to compute_at, e.g., D - Stage attach_stage; - - // omitted - }; - -Consider the above examples again. In order for InferBound to determine how many elements of C must be computed, it is important to know whether the computation of C occurs within the scope of a leaf variable of D, or above that scope. For example, in Ex. 1, the computation of C occurs *above* the scopes of all of D's leaf variables. In Ex. 2, the computation of C occurs *within* the scope of all of D's leaf variables. In Ex. 3, C occurs within the scope of D's i, but above the scope of D's j. - -CreateAttachPath is responsible for figuring out which scopes contain a stage C. These scopes are ordered from innermost scope to outermost. Thus for each stage CreateAttachPath produces an "attach path", which lists the scopes containing the stage, from innermost to outermost scope. In Ex. 1, the attach path of C is empty. In Ex. 2, the attach path of C contains {j, i}. In Ex. 3, the attach path of C is {i}. - -The following example clarifies the concept of an attach path, for a more complicated case. - -**Ex. 4** - -:: - - C = tvm.compute((5, 16), lambda i, j : tvm.const(5, "int32"), name='C') - D = tvm.compute((4, 5, 16), lambda di, dj, dk : C[dj, dk]*2, name='D') - s = tvm.create_schedule(D.op) - s[C].compute_at(s[D], D.op.axis[2]) - -Here is the IR after ScheduleOps (note that loops with extent 1 have been preserved, using the ``debug_keep_trivial_loop`` argument of ScheduleOps): - -:: - - realize D([0, 4], [0, 5], [0, 16]) { - produce D { - for (di, 0, 4) { - for (dj, 0, 5) { - for (dk, 0, 16) { - realize C([dj, 1], [dk, 1]) { - produce C { - for (i, 0, 1) { - for (j, 0, 1) { - C((i + dj), (j + dk)) =5 - } - } - } - D(di, dj, dk) =(C(dj, dk)*2) - } - } - } - } - } - } - -In this case, the attach path of C is {dk, dj, di}. Note that C does not use di, but di still appears in C's attach path. - -**Ex. 5** - -Compute_at is commonly applied after splitting, but this can be handled very naturally given the above definitions. In the example below, the attachment point of C is j_inner of D. The attach path of C is {j_inner, j_outer, i}. - -:: - - C = tvm.compute((5, 16), lambda i, j : tvm.const(5, "int32"), name='C') - D = tvm.compute((5, 16), lambda i, j : C[i, j]*2, name='D') - s = tvm.create_schedule(D.op) - d_o, d_i = s[D].split(D.op.axis[1], factor=8) - s[C].compute_at(s[D], d_i) - -The IR in this case looks like: - -:: - - for i 0, 5 - for j_outer 0, 2 - for j_inner 0, 8 - C[0] = 5 - D[i, j_outer*8 + j_inner] = C[0]*2 - -Building an Attach Path -~~~~~~~~~~~~~~~~~~~~~~~ - -We continue to refer to stages C and D, as introduced in the previous section. The CreateAttachPath algorithm builds the attach path of a stage C as follows. If C does not have attach_type ``kScope``, then C has no attachment, and C's attach path is empty. Otherwise, C is attached at attach_stage=D. We iterate through D's leaf variables in top-down order. All leaf variables starting from C.attach_ivar and lower are added to C's attach path. Then, if D is also attached somewhere, e.g., to stage E, the process is repeated for E's leaves. Thus CreateAttachPath continues to add variables to C's attach path until a stage with no attachment is encountered. 
- -In the example below, C is attached at D, and D is attached at E. - -:: - - C = tvm.compute((5, 16), lambda ci, cj : tvm.const(5, "int32"), name='C') - D = tvm.compute((5, 16), lambda di, dj : C[di, dj]*2, name='D') - E = tvm.compute((5, 16), lambda ei, ej : D[ei, ej]*4, name='E') - s = tvm.create_schedule(E.op) - s[C].compute_at(s[D], D.op.axis[1]) - s[D].compute_at(s[E], E.op.axis[1]) - -With ``debug_keep_trivial_loop=True``, the attach path of C is {dj, di, ej, ei}, and the attach path of D is {ej, ei}: - -:: - - // attr [D] storage_scope = "global" - allocate D[int32 * 1] - // attr [C] storage_scope = "global" - allocate C[int32 * 1] - produce E { - for (ei, 0, 5) { - for (ej, 0, 16) { - produce D { - for (di, 0, 1) { - for (dj, 0, 1) { - produce C { - for (ci, 0, 1) { - for (cj, 0, 1) { - C[(ci + cj)] = 5 - } - } - } - D[(di + dj)] = (C[(di + dj)]*2) - } - } - } - E[((ei*16) + ej)] = (D[0]*4) - } - } - } - -InferBound with compute_at -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Now that the concept of an attach path has been introduced, we return to how InferBound differs if the schedule contains compute_at. The only difference is in InferRootBound, :ref:`Phase1` and :ref:`Phase2`. - -In InferRootBound, the goal is to determine Ranges for the root_iter_vars of a particular stage, C. Phases 1-2 of InferRootBound assign IntSets to the leaf IterVars of C's consumers, and then propagate those IntSets up to the consumers' root_iter_vars. - -If there are no attachments, the Ranges already computed for the consumer's variables define how much of C is needed by the consumer. However, if the stage is actually inside the scope of one of the consumer's variables j, then only a single point within the Range of j is needed at a time. - -.. _Phase1CA: - -Phase 1: Initialize IntSets for consumer's leaf_iter_vars -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: cpp - - /* - * Input: Map rmap: contains the Range for each IterVar of the consumer stage - * Output: Map up_state: contains an IntSet for each leaf_iter_var of the consumer - */ - -In Phase 1, IntSets for each of the consumer's leaf_iter_vars are created, based on the Ranges of the leaf_iter_vars from rmap. Recall that the consumer has already been visited by InferBound, so all of its IterVars have known Ranges in rmap. - -There are three cases: - -- Case 1: Extent of leaf var's Range is 1. In this case, the up_state for the leaf is just a single point, equal to the Range's min. -- Case 2: No relaxation is needed. In this case, the up_state for the leaf is just a single point, defined by the leaf var itself. -- Case 3: Relaxation is needed. In this case, the leaf's Range is simply converted to an IntSet. - -Case 2 occurs if we encounter the attachment point of stage C in the consumer. For this attach_ivar, and all higher leaf variables of the consumer, Case 2 will be applied. This ensures that only a single point within the Range of the leaf variable will be requested, if C is inside the leaf variable's scope. - -.. _Phase2CA: - -Phase 2: Propagate IntSets from consumer's leaves to consumer's roots -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -.. code:: cpp - - /* - * Input: Map up_state: consumer leaf -> IntSet - * Output: Map dom_map: consumer root -> IntSet - */ - -Phase 2 begins by calling PassUpDomain, which visits the IterVarRelations of the consumer stage. 
In the case of a Split relation, PassUpDomain sets the up_state of the parent IterVar, based on the inner and outer IntSets, as follows: - -- Case 1: The Ranges of outer and inner IterVars match their ``up_state`` domains. In this case, set the parent's ``up_state`` by simply converting the parent's Range to an IntSet. -- Case 2: Otherwise, the parent's ``up_state`` is defined by evaluating ``outer*f + inner + rmap[parent]->min``, with respect to the ``up_state`` of outer and inner. Here, instead of using the Split relation's factor, TVM uses* ``f = rmap[inner]->extent``. - - -Now, because the schedule contains compute_at, it is possible for Case 2 to apply. This is because the leaf IntSets may now be initialized to a single point within their Range (Case 2 of :ref:`Phase1CA`), so the IntSets will no longer always match the Ranges. - -After PassUpDomain has finished propagating up_state to all IterVars of the consumer, a fresh map, from root_iter_vars to IntSet, is created. If the stage is not attached to the current consumer, then for each variable iv in the consumer's attach_path, iv's Range is added to a ``relax_set``. The root variables of the stage are evaluated with respect to this ``relax_set``. - -This is to handle cases like the following example, where C is not attached anywhere, but its consumer D is attached in stage E. In this case, D's attach_path, {ej, ei} must be considered when determining how much of C must be computed. - -:: - - C = tvm.compute((5, 16), lambda ci, cj : tvm.const(5, "int32"), name='C') - D = tvm.compute((5, 16), lambda di, dj : C[di, dj]*2, name='D') - E = tvm.compute((5, 16), lambda ei, ej : D[ei, ej]*4, name='E') - s = tvm.create_schedule(E.op) - s[D].compute_at(s[E], E.op.axis[1]) - - -:: - - for ci 0, 5 - for cj 0, 16 - C[ci, cj] = 5 - for ei 0, 5 - for ej 0, 16 - D[0] = C[ei, ej]*2 - E[ei, ej] = D[0]*4 - -Limitations of PassUpDomain -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -This section describes known limitations of PassUpDomain. These limitations affect the Ranges produced by InferBound, as well as other users of PassUpDomain such as ``tensorize``. - -**Ex. 6** - -Above, we discussed the behavior of PassUpDomain on Split relations only. In the following example, the schedule contains ``fuse`` in addition to ``split``. In the TVM program below, the operation C has two axes that are fused, and then the fused axis is split. Note that all tensors are originally of shape ``(4, 4)`` and the fused axis is split by factor ``4`` as well. Therefore, it would be natural to assume that the effect of the fuse is simply undone by the split. However, this is not the case in TVM, as explained below. - -:: - - import tvm - from tvm import te - - n = 4 - m = 4 - - A = te.placeholder((n, m), name='A') - B = te.compute((n, m), lambda bi, bj: A[bi, bj]+2, name='B') - C = te.compute((n, m), lambda ci, cj: B[ci, cj]*3, name='C') - - s = te.create_schedule(C.op) - - fused_axes = s[C].fuse(C.op.axis[0], C.op.axis[1]) - xo, xi = s[C].split(fused_axes, 4) - - s[B].compute_at(s[C], xo) - - print(tvm.lower(s, [A, C], simple_mode=True)) - -The output of this program is shown below. Notice that all 16 elements of B are computed every time through the outer loop, even though C only uses 4 of them. 
- -:: - - // attr [B] storage_scope = "global" - allocate B[float32 * 16] - produce C { - for (ci.cj.fused.outer, 0, 4) { - produce B { - for (bi, 0, 4) { - for (bj, 0, 4) { - B[((bi*4) + bj)] = (A[((bi*4) + bj)] + 2.000000f) - } - } - } - for (ci.cj.fused.inner, 0, 4) { - C[((ci.cj.fused.outer*4) + ci.cj.fused.inner)] = (B[((ci.cj.fused.outer*4) + ci.cj.fused.inner)]*3.000000f) - } - } - } - -This is in contrast to the following IR, which is produced by modifying the above program by deleting the fuse and split, and replacing the compute_at with ``s[B].compute_at(s[C], C.op.axis[0])``. Note that in the IR below, only 4 elements of B are computed at a time, as desired. The size of buffer B is also smaller. - -:: - - // attr [B] storage_scope = "global" - allocate B[float32 * 4] - produce C { - for (ci, 0, 4) { - produce B { - for (bj, 0, 4) { - B[bj] = (A[((ci*4) + bj)] + 2.000000f) - } - } - for (cj, 0, 4) { - C[((ci*4) + cj)] = (B[cj]*3.000000f) - } - } - } - -This example demonstrates that contrary to what we expect, the split does not simply undo the fuse. So what causes the difference? Why is the entire tensor B re-computed 4 times, when only a single row is actually needed at a time? - -Determining the amount of B that must be computed is the responsibility of InferBound. However, the Ranges returned by InferBound for B's root_iter_vars are too large in this case: ``[0, 4]`` for both ``bi`` and ``bj``. This occurs because of a limitation in PassUpDomain on Fuse relations, which we explain next. - -When InferRootBound is working on stage B, it visits B's consumer stage C to find out how much of B is requested by C. C has root_iter_vars ci and cj, which have been fused and then split. This results in the following :ref:`IterVarHyperGraph` for stage C. - - -.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/main/images/docs/inferbound/passupdomain_problem.png - :align: center - - - -We trace the execution of InferRootBound on stage B. Recall that :ref:`Phase1CA` of InferRootBound involves setting the IntSets for all leaf_iter_vars of B's consumer stage C. In this case, C's leaf_iter_vars are ``ci.cj.fused.outer`` and ``ci.cj.fused.inner``. Since B is attached at ``ci.cj.fused.outer``, ``ci.cj.fused.inner`` must be relaxed but ``ci.cj.fused.outer`` is a single point. The IntSets of C's leaf_iter_vars, after :ref:`Phase1CA`, are shown in the following table. - -+----------------------+---------------------------------------------------+ -| IterVar | IntSet after Phase 1 | -+======================+===================================================+ -| ``ci.cj.fused.inner``|``[0, (min(4, (16 - (ci.cj.fused.outer*4))) - 1)]``| -+----------------------+---------------------------------------------------+ -| ``ci.cj.fused.outer``| ``[ci.cj.fused.outer, ci.cj.fused.outer]`` | -+----------------------+---------------------------------------------------+ - -In :ref:`Phase2CA` of InferRootBound, PassUpDomain is called on all of C's IterVarRelations in bottom-up order. - -PassUpDomain is called on C's Split node first. Case 2 of PassUpDomain applies, because the IntSet of ``ci.cj.fused.outer`` is just a single point, and doesn't equal its Range (as previously computed by InferBound on stage C). PassUpDomain therefore sets the IntSet of ``ci.cj.fused`` based on the IntSets of ``ci.cj.fused.inner`` and ``ci.cj.fused.outer``, as shown in row 3 of the following table. 
- -+----------------------+--------------------------------------------------------------------------------------------------+ -| IterVar | IntSet after PassUpDomain on SplitNode | -+======================+==================================================================================================+ -| ``ci.cj.fused.inner``| ``[0, (min(4, (16 - (ci.cj.fused.outer*4))) - 1)]`` | -+----------------------+--------------------------------------------------------------------------------------------------+ -| ``ci.cj.fused.outer``| ``[ci.cj.fused.outer, ci.cj.fused.outer]`` | -+----------------------+--------------------------------------------------------------------------------------------------+ -| ``ci.cj.fused`` | ``[(ci.cj.fused.outer*4), ((ci.cj.fused.outer*4) + (min(4, (16 - (ci.cj.fused.outer*4))) - 1))]``| -+----------------------+--------------------------------------------------------------------------------------------------+ - -After PassUpDomain is called on the Split node, it is called on the Fuse node. - -- Case 1: the Range of IterVar ``fused`` (i.e., as previously calculated by InferBound) is equal to its IntSet -- Case 2: the IntSet of IterVar ``fused`` is a single point -- Case 3: otherwise - -In our case, the Range of ``ci.cj.fused``, is [0, 16). This is not equal to the IntSet of ``ci.cj.fused``, which has extent at most 4 (see row 3 of the table above). Therefore Case 1 does not apply. Case 2 doesn't apply either, since the IntSet of ``ci.cj.fused`` is not a single point. Therefore, only the default Case 3 applies. - -Unfortunately in Case 3, PassUpDomain conservatively applies a "fallback inference rule", i.e., it just returns IntSets equal to the Ranges of ``ci`` and ``cj``. Since C is the output stage of the schedule, we know that InferBound will have set the Ranges of the root_iter_vars of C (i.e., ``ci`` and ``cj``) to their original dimensions (i.e., the ``dom`` value of their IterVars). The resulting output of PassUpDomain for ``ci`` and ``cj`` is shown in the last two rows of the table below. 
- -+----------------------+--------------------------------------------------------------------------------------------------+ -| IterVar | IntSet after PassUpDomain on FuseNode | -+======================+==================================================================================================+ -| ``ci.cj.fused.inner``| ``[0, (min(4, (16 - (ci.cj.fused.outer*4))) - 1)]`` | -+----------------------+--------------------------------------------------------------------------------------------------+ -| ``ci.cj.fused.outer``| ``[ci.cj.fused.outer, ci.cj.fused.outer]`` | -+----------------------+--------------------------------------------------------------------------------------------------+ -| ``ci.cj.fused`` |``[(ci.cj.fused.outer*4), ((ci.cj.fused.outer*4) + (min(4, (16 - (ci.cj.fused.outer*4))) - 1))]`` | -+----------------------+--------------------------------------------------------------------------------------------------+ -| ``ci`` | ``[0, 4]`` | -+----------------------+--------------------------------------------------------------------------------------------------+ -| ``cj`` | ``[0, 4]`` | -+----------------------+--------------------------------------------------------------------------------------------------+ - -This is enough to guarantee that consumer C requests *all* elements of B: the IntSets of ``ci`` and ``cj`` become requests from consumer C to the output tensors of stage B (via PropBoundToInputs in :ref:`Phase3` and GatherBound in :ref:`Phase4`). - -This example shows that schedules containing a split of fused axes are difficult to handle in TVM. The source of the difficulty is similar to the limitations of GatherBound. The region of tensor B requested by a consumer C must be a single rectangular region of B. Or, if B has more than two dimensions, the region of B must be expressible as an independent Range for each of its axes. - -If the split factor is 4, or 8, in the above example, the region of B needed in each iteration of the outer loop is rectangular. - -.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/main/images/docs/inferbound/passupdomain_div.png - :align: center - -However, if the split factor is changed from 4 to 3 in the example above, it is easy to see that the region of B that C needs can no longer be described by an independent Range for each of its axes. - - -.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/main/images/docs/inferbound/passupdomain_nodiv.png - :align: center - -The best that can be done with rectangular regions is shown in the following diagram. The orange regions are the minimum rectangular regions covering the region of B that needs to be computed, at each iteration of the outer loop. - -.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/main/images/docs/inferbound/passupdomain_min.png - :align: center diff --git a/docs/arch/microtvm_design.rst b/docs/arch/microtvm_design.rst deleted file mode 100644 index f9c06c10b677..000000000000 --- a/docs/arch/microtvm_design.rst +++ /dev/null @@ -1,357 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at -.. http://www.apache.org/licenses/LICENSE-2.0 -.. 
Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -.. _microtvm-design: - -************************** -microTVM Design Document -************************** - -.. contents:: Table of Contents - :depth: 3 - -Background -=========== - -TVM is a model deployment framework that has demonstrated good performance across a wide range of -models on traditional operating systems. Given TVM's layered approach to compilation, it is a -natural extension to target bare metal devices. While most of the compilation flow does not need to -change for a proof-of-concept implementation on such devices, the runtime cannot depend on: - -* **Virtual Memory**, and by extension any system-provided ``malloc``. Additionally, bare metal - devices typically have very limited memory (measured in KB). Because of this, libraries designed - for such platforms typically need to be more judicious in using memory, and need to release - memory when it is not in use. -* Traditional OS abstractions, such as **files**, **libraries**, and **kernel functions**. Some - projects implement support for these, but they are by no means standard. -* Support for programming languages other than **C**. - -Such changes require a different approach from the TVM C++ runtime typically used on traditional -Operating Systems. - -Typical Use -=========== - -This section discusses our vision of the "typical" microTVM use case. Each component used to achieve -this typical use case is intended to be designed for flexibility, but this unifying vision serves to -motivate the inclusion of each part of the design. - -.. figure:: https://raw.githubusercontent.com/tvmai/web-data/main/images/dev/microtvm_workflow.svg - :align: center - :width: 85% - -The parts of this process are described below: - -#. **Model Import**. The user imports an existing model or describes a new model to TVM, producing a - *Relay module*. - -#. **Model Transformations**. The user can apply transformations, such as quantization, to the - model. After each transformation, the user should still have a Relay module. - -#. **Compilation** (Scheduling and Code Generation). TVM implements each operator into Tensor IR by - assigning a schedule and schedule configuration to each Relay operator. Then, code (C source or - compiled object) is generated for each operator. - -#. **Integration**. The generated code is integrated along with the TVM C Runtime library into a - user-supplied binary project. In some cases (such as when the project is standardized across - multiple SoC/development boards), this process is handled automatically. - -#. **Deployment**. The project is built and the residual firmware binary is flashed onto the device. - Model inference is driven either by TVM using an on-device RPC server, or on the device using the - on-device Graph Executor. - -Design Goals -============ - -microTVM aims to achieve these design goals: - -1. **Portable Code**. microTVM can translate any Relay model into C code that can compile with only - a C standard library. -2. **Minimal Overhead**. microTVM generates target-specific, highly optimized code. As much overhead - from the runtime should be removed. -3. **Accessible Code**. 
microTVM considers C source code as a first-class output mechanism so that - it is easier for a firmware engineer to understand and tweak. - -Overview -======== - -microTVM requires changes at all levels of the TVM compiler stack. The following sub-sections enumerate -these changes at a high level, and follow-on sections discuss the specifics in more detail. - -Modeling Target Platforms -------------------------- - -TVM's search-based optimization approach allows it to largely avoid system-level modeling of targets -in favor of experimental results. However, some modeling is necessary in order to ensure TVM is -comparing apples-to-apples search results, and to avoid wasting time during the search by attempting -to compile invalid code for a target. - -microTVM models these parts of the target: - -* The CPU used, through the ``-mcpu`` and ``-march`` target flags. -* The presence or absence of accelerators, through the device components of the target (Currently - only the absence of accelerators can be expressed, but this mechanism should extend well). - -microTVM aims to model these parts of the target in the future: - -* Memory, modeled as a set of disjoint memory spaces, each with a label and size and prefetch/flush - behavior. Some memory may be shared with accelerators. -* Target runtime configuration (i.e. clock tree configuration, clock speed, etc). This is intended - only to contribute to the AutoTVM schedule key and not for any other use. - -At this time, TVM does not intend to model: - -* Size, type, or relationship of caches, with the exception of prefetching or cache flushing. - - -TVM Targets for microTVM -------------------------- - -A central data structure in the compilation process is the ``tvm::target::Target`` class. TVM uses -Target to decide which TIR schedules to enable and how to configure the code generator. The Target -class should also uniquely identify the generated code for a particular operator, as autotuning -logs use it to rank measured performance (but see Future Work). - -Targets are currently represented as strings structured similarly to command-line arguments. An -example target is shown below: - - ``c -keys=arm_cpu -mcpu=cortex-m7 -model=stm32f746xx`` - -The relevant parts to microTVM are: - - * Code generator (``llvm`` or ``c``) - * ``-mcpu=cortex-m7``: used by TOPI to enable Cortex-M schedules, and, when the C source code - generator is selected, included in the output as a comment to help identify the code and - configure the downstream C compiler. - -Runtime and Executor configuration for microTVM ------------------------------------------------ - -When using microTVM, it's important to use the C Runtime (``Runtime('crt')``), which is the runtime that works best on micro devices rather than the more dynamic C++ Runtime. Alongside this, there are two executors which you could use in combination with the C runtime: - -* ``Executor("aot")`` - The Ahead of Time (AOT) executor precompiles the network into a runnable function which you can add directly into your micro application -* ``Executor("graph", {"link-params": True})`` - The Graph executor provides a JSON representation of your network and requires the C Runtime's system library to be generated to find functions in the function registry (``Runtime("crt", {"system-lib": True})``). ``{"link-params":True}`` enables parameters to be linked into the generated files rather than provided externally. - -These are specified when building a runtime module: ``relay.build(..., runtime=..., executor=...)``. 
- -Writing Schedules for microTVM ------------------------------- - -For operations scheduled on the CPU, microTVM initially plans to make use of specialized -instructions and extern (i.e. hand-optimized) functions to achieve good performance. In TVM, this -approach is generally accomplished through tensorization, in which TVM breaks a computation into -small pieces, and a TIR extern function accelerates each small piece. - -TVM currently accommodates both approaches using ``tir.call_extern``. First, a pragma is attached to -the schedule defining the extern function in portable C. - - ``sched[output].pragma(n, "import_c", "void call_asm(int32_t* a, int32_t* b) { /* ... */ }")`` - -Next, ``tensorize`` is used to split the computation. - - ``sched[output].tensorize(owi, gemm)`` - -There are a couple of caveats to this approach, all which could be resolved by linking generated -code against external libraries: - -* Inline assembly is compiler-specific. While Clang and GCC have standardized on one syntax, this - may not be portable to other compilers. SDKs solve this by conditionally including a header file - depending on the compiler being used. However, taking this approach means that the generated code - needs additional compiler flags (i.e. ``-Isystempath/to/header``). -* It may be helpful to reference helper functions from the generated code (e.g. to inline common - sequences of hand-optimized assembly). -* Finally, the extern function invoked may be wholly written in an external library. If those - functions can be wholly inlined, this caveat is the same as the previous. If not, then additional - C code needs to be compiled and linked against the operator. - -At present, microTVM presumes that all eligible schedules can be compiled. This means that the user- -supplied project (see next section) must include all libraries that are used by the generated code. -When not using autotuning, TVM randomly chooses a fallback schedule, so all libraries would need to -be supported. When using autotuning, TVM selects the best-performing schedule, so only that library -is needed. There isn't currently a way to force TVM to pick a particular schedule outside of -autotuning logs, but that would be a good addition. - -Finally, when using the ``llvm`` backend, the process is similar except that LLVM bitcode is included -in the generated code (with an ``import_llvm`` pragma). LLVM bitcode provides a portable way to call -inline assembly. However, it may be more complex to call external C functions, and helper functions -are of course not easy to use from LLVM bitcode. - -Executing Models ----------------- - -The TVM compiler traditionally outputs three pieces: - -1. Model operator implementations, as discussed above; -2. A model execution graph, encoded as JSON; and -3. Simplified parameters. - -To correctly execute the model, a Graph Executor needs to reconstruct the graph in memory, load the -parameters, and then invoke the operator implementations in the correct order. - -microTVM supports two ways to do this: - -1. **Host-Driven**. The Graph Executor can run on the host and carry out execution by issuing - commands to the device using an RPC link with a UART-like transport. -2. **Standalone**. A C Graph Executor is available to be compiled on-device, but it is not - particularly memory efficient. This way enables standalone execution without any attached host. - -Host-Driven is designed for experimenting with models on-device and, like AutoTVM, uses the RPC server to -drive computation on-device. 
Standalone is intended for deployment. - -Host-Driven Execution -^^^^^^^^^^^^^^^^^^^^^ - -In Host-Driven execution, the firmware binary is the following: - -1. Generated operator implementations from TVM. -2. The TVM C runtime. -3. SoC-specific initialization. -4. The TVM RPC server. -5. (optional) Simplified Parameters. - -This firmware image is flashed onto the device and a GraphExecutor instance is created on the host. -The GraphExecutor drives execution by sending RPC commands over a UART: - -.. figure:: https://raw.githubusercontent.com/tvmai/web-data/main/images/dev/microtvm_host_driven.svg - :align: center - :width: 85% - -Standalone Execution -^^^^^^^^^^^^^^^^^^^^ - -In Standalone execution, the GraphExecutor is instantiated on device: - -.. figure:: https://raw.githubusercontent.com/tvmai/web-data/main/images/dev/microtvm_standalone.svg - :align: center - :width: 85% - -microTVM Firmware ------------------- - -We can now discuss how microTVM firmware should behave. An important task common to both model -execution strategies is configuring the SoC to match the way it performs in production. microTVM -considers this task project- and SoC-dependent. Whether for AutoTVM, host-driven model inference, or -in standalone deployment, the user is expected to supply a project whose main() does the following: - -1. Configure the SoC to match deployment performance. -2. Initialize the TVM C Runtime. - -When configuring for host-driven inference or AutoTVM, the remaining tasks are well-defined: - -3. Initialize a transport (i.e. a UART) for use with the TVM RPC server. -4. Launch the TVM RPC Server. - -When configuring for standalone deployment, the firmware needs to: - -1. Instantiate the system library by calling the ``runtime.SystemLib`` PackedFunc. -2. Instantiate a GraphExecutor passing the system library module. -3. Configure parameters and inputs as needed. -4. Run the model. - -Parts of a microTVM Binary --------------------------- - -To summarize, a microTVM firwmare binary image must contain these parts: - -1. Operator implementations, produced by TVM. -2. The TVM C runtime library, supplied by TVM as a static library. -3. SoC Initialization, supplied by the user. - -For Host-driven model execution, firmware also needs: - -4. The TVM RPC Server library. - -For Standalone model execution, firmware also needs: - -4. The TVM C GraphExecutor library, supplied by TVM as a static library. -5. The remaining compiler outputs (Simplified Parameters and Graph JSON). - -The Automated Build Flow ------------------------- - -Once code generation is complete, ``tvm.relay.build`` returns a ``tvm.runtime.Module`` and the -user can save the generated C source or binary object to a ``.c`` or ``.o`` file. From this point, TVM -can theoretically step back and the user can compile and run the code separately. - -However, for AutoTVM, TVM needs some automated flow to handle the following tasks: - -1. Integrate operator implementations, the TVM C Runtime library, and the TVM RPC Server library into the - firmware project containing user-supplied SoC Initialization. -2. Build the resulting project. -3. Program the built firmware onto a (specific) attached device. -4. Identify the serial port or other transport to be used by TVM to drive remote execution. - -At present, TVM expects the user to supply an implementation of the ``tvm.micro.Compiler``, -``tvm.micro.Flasher``, and ``tvm.micro.Transport`` interfaces. TVM then: - -1. Builds each piece separately as a library. -2. 
Builds the libraries into a binary firmware image. -3. Programs the firmware image onto an attached device. -4. Opens a serial port to serve as the RPC server transport. - -This design was chosen to reduce build times for microTVM (the common libraries need to be built -only once per candidate operator implemmentation). In practice, these projects are extremely small -and compile relatively quickly. Compared with the added complexity of this tighter build integration -with TVM, the performance gains are likely not worth it. A future design will consolidate the build -tasks into a single step and narrow the interface to provide a better integration. - -Measuring operator performance ------------------------------- - -The TVM C runtime depends on user-supplied functions to measure time on-device. Users should implement -``TVMPlatformTimerStart`` and ``TVMPlatformTimerStop``. These functions should measure wall clock time, so there -are some pitfalls in implementing these functions: - -1. If the CPU could halt or sleep during a computation (i.e. if it is being done on an accelerator), - a cycle counter should likely not be used as these tend to stop counting while the CPU is asleep. -2. The granularity of these functions can be relaxed as needed to extend the range of the timer - device. However, if granularity is too coarse, a sub-optimal schedule may be used. -3. An error should be raised if the timer overflows. -4. The timer should not interrupt computation unless absolutely necessary. Doing so may affect the - accuracy of the results. -5. Calibrating the output against a wall clock is ideal, but it will likely be too cumbersome. A - future PR could enable some characterization of the platform timer by, e.g., measuring the internal - oscillator against a reference such as an external crystal. - -Future Work -=========== - -Ahead-of-Time Runtime ----------------------- - -A limitation of the Graph Executor is the amount of memory overhead required in parsing the JSON. -The current implementation contributes significantly to the dynamic memory usage of microTVM, -limiting its utility. An ahead-of-time runtime can avoid the need for any Graph JSON parsing and -improve inference speed by generating C code to call the generated operator implementations directly -rather than relying on a data-driven approach with the Graph Executor. - -Memory Planning ----------------- - -The current memory planner attempts to limit the number of ``TVMBackendDeviceAlloc()`` calls -issued for intermediate tensors only. Because scratchpads can vary widely, and because the planner -coalesces memory allocations within 16x of each other, this strategy typically results in high -peak memory usage. - -Heterogeneous Execution ------------------------ - -Newer Cortex-M SoCs can contain multiple CPUs and onboard ML accelerators. - - -Autotuning Target ------------------ - -As discussed previously, diff --git a/docs/arch/microtvm_project_api.rst b/docs/arch/microtvm_project_api.rst deleted file mode 100644 index 381b57876aaa..000000000000 --- a/docs/arch/microtvm_project_api.rst +++ /dev/null @@ -1,150 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. 
http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -.. _microtvm_project_api: - -microTVM Project API -==================== - -About microTVM Project API --------------------------- - -The microTVM Project API allows TVM to automatically run models on -unconventional or embedded platforms. It allows platforms to define a standard -function to integrate TVM compiler output with boilerplate platform-specific -code, producing a runnable **Project**. Project API then further defines -functions to build that project, program compatible devices accessible from the -TVM machine, and communicate with the running code so that TVM can perform -host-driven inference and autotuning. - -There are many cases where it might be desirable simply to invoke microTVM as a -tool from your platform's build process. Indeed, for the average firmware -developer, this is likely to be all they need. However, there are a couple of -use cases when you may want to teach microTVM how to build firmware using your -platform's build tool: - -1. To enable AutoTVM and AutoScheduling on your platform. Defining a Project - API implementation allows TVM to tune models for peak performance on your - platform. -2. To enable engineers without firmware expertise to experiment with models on - your platform. Defining a Project API implementation allows these engineers - to leverage the standard TVM Python workflows to perform host-driven - inference on your platform. -3. Integration Testing. Defining a Project API implementation allows you to - create Continuous Integration Tests which verify model correctness and - performance on your platform. - -API Definition --------------- - -The full API is the ``abstractmethod`` defined on ``ProjectAPIHandler`` in -`python/tvm/micro/project_api/server.py `_. -Rather than duplicate the documentation here, we simply refer you to that class. - -How TVM uses Project API ------------------------- - -This section explains how the Project API should be used with TVM. Project API -is defined around the *Project* as the buildable unit of firmware. TVM expects -to be provided initially with a directory containing a *Template Project*, which -together with a :ref:`Model Library Format ` file can be -built into a runnable project. - -Inside the Template Directory is (typically) a Python script implementing the -API server. TVM launches this script in a subprocess and sends commands to the -server to perform each of the actions outlined above. - -The typical usage flow is as follows: - -1. Launch Project API server in Template Project. -2. Verify the API server is version-compatible with TVM, plus read properties - of the implementation, by sending ``server_info_query`` command. -3. Generate a new project by sending command ``generate_project`` to create a - new project. The arguments to this command is a Model Library Format and a - non-existent directory which should be populated with the generated - project. The Template Project API server should copy itself into the - newly-generated project. -4. Terminate the Template Project API server. -5. Launch Project API server in Generated Project. -6. 
Verify the API server is version-compatible with TVM, plus read properties
-   of the implementation, by sending ``server_info_query`` command.
-7. Build and flash the project by sending commands ``build`` and ``flash`` to the
-   API server.
-8. Communicate with the target. Send command ``open_transport`` followed by
-   commands ``write_transport`` and ``read_transport`` to write and read from
-   e.g. a serial port attached to the target. Upon completion,
-   ``close_transport`` is sent.
-9. Terminate the Project API server.
-
-Disk Layout of the Project
---------------------------
-
-In the root directory of a project (template or generated), one of the following
-two files must exist:
-
-- ``microtvm_api_server.py`` - the suggested approach. Place a
-  python3-compatible Python script in the root directory. TVM will execute this
-  script in its own process using the same interpreter used to execute TVM.
-- ``microtvm_api_server.sh`` (on Windows, ``microtvm_api_server.bat``) -
-  alternate approach. When a different Python interpreter is necessary, or
-  when you want to implement the server in a different language, create this
-  executable file. TVM will launch this file in a separate process.
-
-Aside from these two files, no other restrictions are made on the layout.
-
-Communication between TVM and Project API Server
--------------------------------------------------
-
-TVM communicates with the Project API server using `JSON-RPC 2.0
-`_. TVM always launches API servers using
-the following command-line:
-
-``microtvm_api_server.py --read-fd --write-fd ``
-
-Commands are sent from TVM to the server over the file descriptor given by
-``--read-fd`` and replies are received by TVM from the server over the file
-descriptor given by ``--write-fd``.
-
-Helpers for Implementing the API server in Python
---------------------------------------------------
-
-TVM provides helper utilities that make it easy to implement the server in Python.
-To implement the server in Python, create ``microtvm_api_server.py`` and add
-``from tvm.micro.project_api import server`` (or, copy this file into your template
-project--there are no dependencies--and import it there instead). Next, subclass
-``ProjectAPIHandler``::
-
-    class Handler(server.ProjectAPIHandler):
-        def server_info_query(self, tvm_version):
-            # Implement server_info_query
-
-        def generate_project(self, model_library_format_path, standalone_crt_dir, project_dir, options):
-            # Implement generate_project
-
-        # ...
-
-Finally, invoke the helper ``main()``::
-
-    if __name__ == "__main__":
-        server.main(Handler())
-
-Using Project API from ``tvmc``
--------------------------------
-
-Each major Project API command is available through the ``tvmc micro``
-sub-command to make debugging interactions simple. Invoke ``tvmc micro --help``
-for more information.
diff --git a/docs/arch/model_library_format.rst b/docs/arch/model_library_format.rst
deleted file mode 100644
index 3ee6b9878f3f..000000000000
--- a/docs/arch/model_library_format.rst
+++ /dev/null
@@ -1,171 +0,0 @@
-.. Licensed to the Apache Software Foundation (ASF) under one
-   or more contributor license agreements. See the NOTICE file
-   distributed with this work for additional information
-   regarding copyright ownership. The ASF licenses this file
-   to you under the Apache License, Version 2.0 (the
-   "License"); you may not use this file except in compliance
-   with the License. You may obtain a copy of the License at
-
-..   http://www.apache.org/licenses/LICENSE-2.0
-
-.. 
Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -.. _model_library_format: - -Model Library Format -==================== - -About Model Library Format --------------------------- - -TVM traditionally exports generated libraries as Dynamic Shared Objects (e.g. DLLs (Windows) or .so -(linux)). Inferences can be performed using those libraries by loading them into an executable using -``libtvm_runtime.so``. This process is very dependent on services provided by traditional OS. - -For deployment to unconventional platforms (e.g. those lacking traditional OS), TVM provides another -output format, Model Library Format. Initially, the microTVM project is the primary use case for this -format. Should it become useful in other use cases (and in particular, should it become possible to -export BYOC artifacts in Model Library Format), it could be used as a general-purpose TVM export -format. Model Library Format is a tarball containing a file for each piece of the TVM compiler -output. - -What can be Exported? ---------------------- - -At the time of writing, export is limited to full models built with ``tvm.relay.build``. - -Directory Layout ----------------- - -Model Library Format is contained within a tarball. All paths are relative to the root of the -tarball: - -- ``/`` - Root of the tarball - - - ``codegen`` - Root directory for all generated device code - - - (see `codegen`_ section) - - - ``executor-config/`` - Configuration for the executor which drives model inference - - - ``graph/`` - Root directory containing configuration for the GraphExecutor - - - ``graph.json`` - GraphExecutor JSON configuration - - - ``metadata.json`` - Machine-parseable metadata for this model - - - ``parameters/`` - Root directory where simplified parameters are placed - - - ``.params`` - Parameters for the model tvm.relay._save_params format - - - ``src/`` - Root directory for all source code consumed by TVM - - - ``relay.txt`` - Relay source code for the generated model - -Description of Sub-directories ------------------------------- - -.. _subdir_codegen: - -``codegen`` -^^^^^^^^^^^ - -All TVM-generated code is placed in this directory. At the time of writing, there is 1 file per -Module in the generated Module tree, though this restriction may change in the future. Files in -this directory should have filenames of the form ``/(lib|src)/.``. - -These components are described below: - - * ```` - Identifies the TVM target on which the code should run. Currently, only ``host`` - is supported. - * ```` - A unique slug identifying this file. Currently ``lib``, with ``>`` an - auto-incrementing integer. - * ```` - Suffix identifying the filename format. Currently ``c`` or ``o``. 
- -An example directory tree for a CPU-only model is shown below: - -- ``codegen/`` - Codegen directory - - - ``host/`` - Generated code for ``target_host`` - - - ``lib/`` - Generated binary object files - - - ``lib0.o`` - LLVM module (if ``llvm`` target is used) - - ``lib1.o`` - LLVM CRT Metadata Module (if ``llvm`` target is used) - - - ``src/`` - Generated C source - - - ``lib0.c`` - C module (if ``c`` target is used) - - ``lib1.c`` - C CRT Metadata module (if ``c`` target is used) - -``executor-config`` -^^^^^^^^^^^^^^^^^^^ - -Contains machine-parsable configuration for executors which can drive model inference. Currently, -only the GraphExecutor produces configuration for this directory, in ``graph/graph.json``. This -file should be read in and the resulting string supplied to the ``GraphExecutor()`` constructor for -parsing. - -``parameters`` -^^^^^^^^^^^^^^ - -Contains machine-parseable parameters. A variety of formats may be provided, but at present, only -the format produced by ``tvm.relay._save_params`` is supplied. When building with -``tvm.relay.build``, the ``name`` parameter is considered to be the model name. A single file is -created in this directory ``.json``. - -``src`` -^^^^^^^ - -Contains source code parsed by TVM. Currently, just the Relay source code is created in -``src/relay.txt``. - -Metadata --------- - -Machine-parseable metadata is placed in a file ``metadata.json`` at the root of the tarball. -Metadata is a dictionary with these keys: - -- ``export_datetime``: Timestamp when this Model Library Format was generated, in - `strftime `_ - format ``"%Y-%M-%d %H:%M:%SZ",``. -- ``memory``: A summary of the memory usage of each generated function. Documented in - `Memory Usage Summary`_. -- ``model_name``: The name of this model (e.g. the ``name`` parameter supplied to - ``tvm.relay.build``). -- ``executors``: A list of executors supported by this model. Currently, this list is always - ``["graph"]``. -- ``target``: A dictionary mapping ``device_type`` (the underlying integer, as a string) to the - sub-target which describes that relay backend used for that ``device_type``. -- ``version``: A numeric version number that identifies the format used in this Model Library - Format. This number is incremented when the metadata structure or on-disk structure changes. - This document reflects version ``5``. - -Memory Usage Summary -^^^^^^^^^^^^^^^^^^^^ - -A dictionary with these sub-keys: - - - ``"main"``: ``list[MainFunctionWorkspaceUsage]``. A list summarizing memory usage for each - workspace used by the main function and all sub-functions invoked. - - ``"operator_functions"``: ``map[string, list[FunctionWorkspaceUsage]]``. Maps operator function - name to a list summarizing memory usage for each workpace used by the function. - -A ``MainFunctionWorkspaceUsage`` is a dict with these keys: - -- ``"device"``: ``int``. The ``device_type`` associated with this workspace. -- ``"workspace_size_bytes"``: ``int``. Number of bytes needed in this workspace by this function - and all sub-functions invoked. -- ``"constants_size_bytes"``: ``int``. Size of the constants used by the main function. -- ``"io_size_bytes"``: ``int``. Sum of the sizes of the buffers used from this workspace by this - function and sub-functions. - -A ``FunctionWorkspaceUsage`` is a dict with these keys: - -- ``"device"``: ``int``. The ``device_type`` associated with this workspace. -- ``"workspace_size_bytes"``: ``int``. Number of bytes needed in this workspace by this function. 
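-
-As a quick illustration of the metadata described above, the following is a
-minimal sketch (not part of the exported artifact) that reads ``metadata.json``
-out of a Model Library Format tarball using only the Python standard library;
-the path ``model.tar`` is a placeholder for an archive produced by TVM.
-
-.. code:: python
-
-    import json
-    import tarfile
-
-    with tarfile.open("model.tar") as tar:
-        # metadata.json sits at the root of the tarball.
-        member = next(m for m in tar.getmembers() if m.name.endswith("metadata.json"))
-        metadata = json.load(tar.extractfile(member))
-
-    print(metadata["model_name"], metadata["version"], metadata["executors"])
-    # Per-workspace usage of the main function, as documented in Memory Usage Summary.
-    for usage in metadata["memory"]["main"]:
-        print(usage["device"], usage["workspace_size_bytes"])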
diff --git a/docs/arch/relay_intro.rst b/docs/arch/relay_intro.rst deleted file mode 100644 index 87f68fcbce2e..000000000000 --- a/docs/arch/relay_intro.rst +++ /dev/null @@ -1,206 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -.. _relay-dev-intro: - -Introduction to Relay IR -======================== -This article introduces Relay IR -- the second generation of NNVM. -We expect readers from two kinds of background -- those who have a programming language background and deep learning -framework developers who are familiar with the computational graph representation. - -We briefly summarize the design goal here, and will touch upon these points in the later part of the article. - -- Support traditional data flow-style programming and transformations. -- Support functional-style scoping, let-binding and making it a fully featured differentiable language. -- Being able to allow the user to mix the two programming styles. - -Build a Computational Graph with Relay --------------------------------------- -Traditional deep learning frameworks use computational graphs as their intermediate representation. -A computational graph (or dataflow graph), is a directed acyclic graph (DAG) that represents the computation. -Though dataflow graphs are limited in terms of the computations they are capable of expressing due to -lacking control flow, their simplicity makes it easier to implement automatic differentiation and -compile for heterogeneous execution environments (e.g., executing parts of the graph on specialized hardware). - -.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/main/images/relay/dataflow.png - :align: center - - -You can use Relay to build a computational (dataflow) graph. Specifically, the above code shows how to -construct a simple two-node graph. You can find that the syntax of the example is not that different from existing -computational graph IR like NNVMv1, with the only difference in terms of terminology: - -- Existing frameworks usually use graph and subgraph -- Relay uses function e.g. -- ``fn (%x)``, to indicate the graph - -Each dataflow node is a CallNode in Relay. The Relay Python DSL allows you to construct a dataflow graph quickly. -One thing we want to highlight in the above code -- is that we explicitly constructed an Add node with -both input point to ``%1``. When a deep learning framework evaluates the above program, it will compute -the nodes in topological order, and ``%1`` will only be computed once. -While this fact is very natural to deep learning framework builders, it is something that might -surprise a PL researcher in the first place. If we implement a simple visitor to print out the result and -treat the result as nested Call expression, it becomes ``log(%x) + log(%x)``. 
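-
-As a concrete reference, the two-node graph above can be sketched with the
-Relay Python DSL as follows. This is a minimal sketch; the shape and dtype are
-illustrative only and are not part of the original example.
-
-.. code:: python
-
-    from tvm import relay
-
-    x = relay.var("x", shape=(10,), dtype="float32")
-    log_x = relay.log(x)            # %1 = log(%x)
-    y = relay.add(log_x, log_x)     # %2 = add(%1, %1): both inputs point to %1
-    func = relay.Function([x], y)
-    print(func)                     # the shared log node is printed once and referenced twice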
- -Such ambiguity is caused by different interpretations of program semantics when there is a shared node in the DAG. -In a normal functional programming IR, nested expressions are treated as expression trees, without considering the -fact that the ``%1`` is actually reused twice in ``%2``. - -The Relay IR is mindful of this difference. Usually, deep learning framework users build the computational -graph in this fashion, where a DAG node reuse often occurs. As a result, when we print out the Relay program in -the text format, we print one CallNode per line and assign a temporary id ``(%1, %2)`` to each CallNode so each common -node can be referenced in later parts of the program. - -Module: Support Multiple Functions (Graphs) -------------------------------------------- -So far we have introduced how can we build a dataflow graph as a function. One might naturally ask: Can we support multiple -functions and enable them to call each other? Relay allows grouping multiple functions together in a module; the code below -shows an example of a function calling another function. - -.. code:: - - def @muladd(%x, %y, %z) { - %1 = mul(%x, %y) - %2 = add(%1, %z) - %2 - } - def @myfunc(%x) { - %1 = @muladd(%x, 1, 2) - %2 = @muladd(%1, 2, 3) - %2 - } - -The Module can be viewed as a ``Map``. Here GlobalVar is just an id that is used to represent the functions -in the module. ``@muladd`` and ``@myfunc`` are GlobalVars in the above example. When a CallNode is used to call another function, -the corresponding GlobalVar is stored in the op field of the CallNode. It contains a level of indirection -- we need to look up -body of the called function from the module using the corresponding GlobalVar. In this particular case, we could also directly -store the reference to the Function as op in the CallNode. So, why do we need to introduce GlobalVar? The main reason is that -GlobalVar decouples the definition/declaration and enables recursion and delayed declaration of the function. - -.. code :: - - def @myfunc(%x) { - %1 = equal(%x, 1) - if (%1) { - %x - } else { - %2 = sub(%x, 1) - %3 = @myfunc(%2) - %4 = add(%3, %3) - %4 - } - } - -In the above example, ``@myfunc`` recursively calls itself. Using GlobalVar ``@myfunc`` to represent the function avoids -the cyclic dependency in the data structure. -At this point, we have introduced the basic concepts in Relay. Notably, Relay has the following improvements over NNVMv1: - -- Succinct text format that eases debugging of writing passes. -- First-class support for subgraphs-functions, in a joint module, this enables further chance of joint optimizations such as inlining and calling convention specification. -- Naive front-end language interop, for example, all the data structure can be visited in Python, which allows quick prototyping of optimizations in Python and mixing them with C++ code. - - -Let Binding and Scopes ----------------------- - -So far, we have introduced how to build a computational graph in the good old way used in deep learning frameworks. -This section will talk about a new important construct introduced by Relay -- let bindings. - -Let binding is used in every high-level programming language. In Relay, it is a data structure with three -fields ``Let(var, value, body)``. When we evaluate a let expression, we first evaluate the value part, assign -it to the var, then return the evaluated result in the body expression. - -You can use a sequence of let bindings to construct a logically equivalent program to a dataflow program. 
-The code example below shows one program with two forms side by side. - -.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/main/images/relay/dataflow_vs_func.png - :align: center - - -The nested let binding is called A-normal form, and it is commonly used as IRs in functional programming languages. -Now, please take a close look at the AST structure. While the two programs are semantically identical -(so are their textual representations, except that A-normal form has let prefix), their AST structures are different. - -Since program optimizations take these AST data structures and transform them, the two different structures will -affect the compiler code we are going to write. For example, if we want to detect a pattern ``add(log(x), y)``: - -- In the data-flow form, we can first access the add node, then directly look at its first argument to see if it is a log -- In the A-normal form, we cannot directly do the check anymore, because the first input to add is ``%v1`` -- we will need to keep a map from variable to its bound values and look up that map, in order to know that ``%v1`` is a log. - -Different data structures will impact how you might write transformations, and we need to keep that in mind. -So now, as a deep learning framework developer, you might ask, Why do we need let bindings? -Your PL friends will always tell you that let is important -- as PL is a quite established field, -there must be some wisdom behind that. - -Why We Might Need Let Binding ------------------------------ -One key usage of let binding is that it specifies the scope of computation. Let us take a look at the following example, -which does not use let bindings. - -.. image:: https://raw.githubusercontent.com/tvmai/tvmai.github.io/main/images/relay/let_scope.png - :align: center - -The problem comes when we try to decide where we should evaluate node ``%1``. In particular, while the text format seems -to suggest that we should evaluate node ``%1`` outside the if scope, the AST(as shown in the picture) does not suggest so. -Actually, a dataflow graph never defines its scope of the evaluation. This introduces some ambiguity in the semantics. - -This ambiguity becomes more interesting when we have closures. Consider the following program, which returns a closure. -We don’t know where should we compute ``%1``; it can be either inside or outside the closure. - -.. code:: - - fn (%x) { - %1 = log(%x) - %2 = fn(%y) { - add(%y, %1) - } - %2 - } - -A let binding solves this problem, as the computation of the value happens at the let node. In both programs, -if we change ``%1 = log(%x)`` to ``let %v1 = log(%x)``, we clearly specify the computation location to -be outside of the if scope and closure. As you can see let-binding gives a more precise specification of the computation site -and could be useful when we generate backend code (as such specification is in the IR). - -On the other hand, the dataflow form, which does not specify the scope of computation, does have its own advantages --- namely, we don’t need to worry about where to put the let when we generate the code. The dataflow form also gives more freedom -to the later passes to decide where to put the evaluation point. As a result, it might not be a bad idea to use data flow -form of the program in the initial phases of optimizations when you find it is convenient. -Many optimizations in Relay today are written to optimize dataflow programs. 
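-
-For reference, the closure example from the previous section can be sketched
-with an explicit let binding (in the same informal notation used throughout
-this section), which pins the evaluation of ``log`` outside the closure:
-
-.. code::
-
-    fn (%x) {
-      let %v1 = log(%x)
-      %2 = fn(%y) {
-        add(%y, %v1)
-      }
-      %2
-    }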
- -However, when we lower the IR to an actual runtime program, we need to be precise about the scope of computation. -In particular, we want to explicitly specify where the scope of computation should happen when we are using -sub-functions and closures. Let-binding can be used to solve this problem in later stage execution specific optimizations. - - -Implication on IR Transformations ---------------------------------- - -Hopefully, by now you are familiar with the two kinds of representations. -Most functional programming languages do their analysis in A-normal form, -where the analyzer does not need to be mindful that the expressions are DAGs. - -Relay choose to support both the dataflow form and let bindings. We believe that it is important to let the -framework developer choose the representation they are familiar with. -This does, however, have some implications on how we write passes: - -- If you come from a dataflow background and want to handle lets, keep a map of var to the expressions so you can perform lookup when encountering a var. This likely means a minimum change as we already need a map from expressions to transformed expressions anyway. Note that this will effectively remove all the lets in the program. -- If you come from a PL background and like A-normal form, we will provide a dataflow to A-normal form pass. -- For PL folks, when you are implementing something (like a dataflow-to-ANF transformation), be mindful that expressions can be DAGs, and this usually means that we should visit expressions with a ``Map`` and only compute the transformed result once, so the resulting expression keeps the common structure. - -There are additional advanced concepts such as symbolic shape inference, polymorphic functions -that are not covered by this material; you are more than welcome to look at other materials. diff --git a/docs/arch/relay_op_strategy.rst b/docs/arch/relay_op_strategy.rst deleted file mode 100644 index dbac7c821827..000000000000 --- a/docs/arch/relay_op_strategy.rst +++ /dev/null @@ -1,282 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -.. _relay-op-strategy: - -Relay Operator Strategy -======================= - -In order to lower Relay operators to the implementations defined in TOPI -library, a compute and schedule function need to be registered to each Relay -operator. However, compute and schedule functions are usually specialized for -each target, and further, even for the same target, we may have multiple -algorithms and implementations available. To deal with the complexity, we -introduce operator strategy to allow developers to define a flexible lowering -strategy for each operator and target. - - -Operator Strategy Design ------------------------- - -The basic element in operator strategy is an ``OpImplementation``. 
It includes
-a pair of compute and schedule functions, the name of the implementation,
-and a priority level (the use of priority level is explained in
-`Select Implementation from Op Strategy`_).
-
-The ``OpStrategy`` includes a list of ``OpSpecialization``. Each ``OpSpecialization``
-contains a list of ``OpImplementation`` associated with a ``SpecializedCondition``
-(see definition in ``include/tvm/te/schedule.h``). The ``SpecializedCondition``
-can be null, indicating the implementations are generally applicable;
-otherwise, the implementations are only considered when the specialized
-condition is satisfied. ``SpecializedCondition`` consists of a list
-of clauses defined in Tensor Expression in conjunctive normal form (CNF) and
-only supports conditions on tensor shapes.
-
-Last, a strategy function, or ``FTVMStrategy``, determines which pair(s) of
-compute and schedule functions should be used given a workload, and needs to be
-registered to each Relay operator. ``FTVMStrategy`` is a generic function (see
-``include/tvm/target/generic_func.h``) that can be overwritten for each
-target. The function signature is
-
-.. code:: c
-
-    OpStrategy(const Attrs& attrs, const Array& inputs, const Type& out_type, const Target& target)
-
-The function returns an ``OpStrategy`` given the op attributes, input
-tensors, output types, and the target to compile to.
-
-
-Write A Strategy Function
--------------------------
-
-We recommend developers write strategy functions in Python, as
-most TOPI compute and schedule functions are written in Python.
-In Python, we provide the ``OpStrategy`` class in ``python/tvm/relay/op/op.py``.
-It only has one API, which is to add an implementation to the strategy:
-
-.. code:: python
-
-    def add_implementation(self, compute, schedule, name="default", plevel=10)
-
-
-We now take ``topk`` as an example to explain how to write the
-``FTVMStrategy`` function:
-
-.. code:: python
-
-    # add to python/tvm/relay/op/strategy/generic.py
-    @override_native_generic_func("topk_strategy")
-    def topk_strategy(attrs, inputs, out_type, target):
-        strategy = _op.OpStrategy()
-        strategy.add_implementation(
-            wrap_compute_topk(topi.topk),
-            wrap_topi_schedule(topi.generic.schedule_topk),
-            name="topk.generic")
-        return strategy
-
-    # add to each target file in python/tvm/relay/op/strategy, e.g., x86.py, cuda.py, etc.
-    @topk_strategy.register(["cuda", "gpu"])
-    def topk_strategy_cuda(attrs, inputs, out_type, target):
-        strategy = _op.OpStrategy()
-        strategy.add_implementation(
-            wrap_compute_topk(topi.cuda.topk),
-            wrap_topi_schedule(topi.cuda.schedule_topk),
-            name="topk.cuda")
-        return strategy
-
-In this example, we use ``topi.cuda.topk`` and ``topi.cuda.schedule_topk``
-as the compute and schedule functions for the CUDA or GPU target, while using the
-TOPI generic compute and schedule for the rest of the targets.
-Note that we use two wrapper functions that wrap the topi
-compute and schedule to conform with the required function signature (
-see ``FTVMCompute`` and ``FTVMSchedule`` in ``include/tvm/relay/op_attr_types.h``).
-Usually we need to write a customized compute wrapper function for each operator
-to get different fields from op attributes.
-
-The example above shows a very basic strategy function that only
-adds one implementation to the strategy. But for many complicated operators,
-we may need to add multiple implementations that use different algorithms.
-For example, we can use both the direct and winograd algorithms to
-compute a conv2d op. 
In order to achieve this, we can write the strategy function -as follows: - -.. code:: python - - strategy.add_implementation( - wrap_compute_conv2d(topi.cuda.conv2d_nchw), - wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw), - name="conv2d_nchw.cuda", - plevel=10) - - if winograd_condition: - strategy.add_implementation( - wrap_compute_conv2d(topi.cuda.conv2d_nchw_winograd), - wrap_topi_schedule(topi.cuda.schedule_conv2d_nchw_winograd), - name="conv2d_nchw_winograd.cuda", - plevel=15) - -In this example, we add two implementations to the conv2d strategy where -winograd algorithm is only added when ``winograd_condition`` is true. -The implementation ``"conv2d_nchw_winograd.cuda"`` will be used to compile -conv2d when ``winograd_condition`` is true as it has higher -priority level (this could be changed if certain implementation is an AutoTVM -template. See `Select Implementation from Op Strategy`_ for more -details). Otherwise, ``"conv2d_nchw.cuda"`` is used. - -We can extend the example above to third party library implementation. For -example, we can add the implementation that invokes kernel in the cblas -library when cblas is included in the target. - -.. code:: python - - if "cblas" in target.libs: - strategy.add_implementation( - wrap_compute_dense(topi.x86.dense_cblas), - wrap_topi_schedule(topi.x86.schedule_dense_cblas), - name="dense_cblas.x86", - plevel=15) - - -Further, we can add implementation specialized for a certain range of shapes. -The code below shows an example of dense strategy that adds an implementation -that is specialized for ``m`` greater than 16. The main difference between -hardcode python condition like examples above and specialized condition is that -it allows TVM to generate multiple kernels when the input tensors have symbolic -shapes. The compile engine will generate a dispatch function that invokes the -specialized kernel when the corresponding condition is met; otherwise, -invoke the kernel that has no associated specialized condition (``dense_common`` -in this example). This part is still work in progress. More details will be -provided after it is done. - -.. code:: python - - def dense_strategy(attrs, inputs, out_type, target): - m = inputs[0].shape[0] - strategy = _op.OpStrategy() - strategy.add_implementation( - wrap_compute_dense(dense_compute1), - wrap_topi_schedule(dense_schedule1), - name="dense_common") - - with tvm.te.SpecializedCondition(m > 16): - strategy.add_implementation( - wrap_compute_dense(dense_compute2), - wrap_topi_schedule(dense_schedule2), - name="dense_for_large_m", - plevel=15) - - return strategy - - -Register Strategy Function to An Operator ------------------------------------------ - -After we define the strategy function for an operator, we can now -register the strategy function to this operator with - -.. code:: python - - register_strategy("topk", strategy.topk_strategy) - -However, it takes much effort to write a strategy function for an operator. -Therefore, we provide two other methods for simpler operators. - -First, for operators that have injective, broadcast, or reduction pattern, we -can call ``register_injective_schedule``, ``register_broadcast_schedule``, and -``register_reduce_schedule`` repsectively. The schedule function for these -patterns are already registered by each target and can be applied to these -operators. We assume the compute function should be the same across all targets, -and ``FTVMCompute`` needs to be registered to the op before invoking register -schedule. - -.. 
code:: python - - register_broadcast_schedule("add") - -Second, for operators that doesn't have these common patterns mentioned before, -but also have the same compute function for all targets, we can use -``register_schedule`` API. It is easier to write ``FTVMSchedule`` function -as we only need to provide which schedule function to use. The following -code snippet shows ``FTVMSchedule`` function for pooling. - -.. code:: python - - # add to python/tvm/relay/op/strategy/generic.py - @generic_func - def schedule_pool(attrs, outs, target): - with target: - return topi.generic.schedule_pool(outs, attrs.layout) - - # add to each target file in python/tvm/relay/op/strategy, e.g., x86.py, cuda.py, etc. - @schedule_pool.register("cpu") - def schedule_pool_cpu(attrs, outs, target): - ... - -After we created the ``FTVMSchedule`` for an operator, we can -register the strategy using ``register_schedule``: - -.. code:: python - - register_schedule("nn.max_pool2d", strategy.schedule_pool) - - -Register Strategies for A New Target ------------------------------------- - -There are two ways to register strategies for a new target. The more -straightforward one is adding a new target file in the directory -``python/tvm/relay/op/strategy``. You only need to customize the strategy for -ops that have been implemented for this new target and reuse the generic -strategies for the rest. - -Alternatively, you can also register the strategy for the new target outside the -TVM python library. The following code snippet shows an example how to do -so. You can find more examples in ``vta/python/vta/top/op.py``. - -.. code:: python - - @relay.op.strategy.conv2d_strategy.register("mytarget") - def conv2d_strategy_mytarget(attrs, inputs, out_type, target): - ... - - -Select Implementation from Op Strategy --------------------------------------- - -During the compilation, Relay compile engine needs to determine which -implementation to use for an operator when there are multiple. The selection -policy works as follows. - -When the input tensors to an operator or a fused op all have constant shapes, -the compile engine first finds the best implementation based on AutoTVM tuning -logs. If there is no implementation that is an AutoTVM template or all AutoTVM -templates have fallback configs, the implementation with highest priority level -will then be chosen. Implementations with same priority level in this case leads -to an undefined behavior, and any of them might be selected. - -The selection policy for ops with symbolic input shapes is still work in -progress. Currently, if any input tensor has a symbolic shape, only the -implementation with highest priority level will be used for this operator. This -will be updated after the implementation finishes. - -For debug purpose, you can add the following lines before you compile the Relay -model to learn which implementation is used for each operator. - -.. code:: python - - logging.getLogger("te_compiler").setLevel(logging.INFO) - logging.getLogger("te_compiler").addHandler(logging.StreamHandler(sys.stdout)) diff --git a/docs/arch/virtual_machine.rst b/docs/arch/virtual_machine.rst deleted file mode 100644 index c532392afeb8..000000000000 --- a/docs/arch/virtual_machine.rst +++ /dev/null @@ -1,410 +0,0 @@ -.. Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. 
The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - -.. http://www.apache.org/licenses/LICENSE-2.0 - -.. Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. - -Putting the VM in TVM: The Relay Virtual Machine -================================================ - -Relay, a new program representation, has enabled the representation and optimization of -a great breadth of machine learning programs. -Unfortunately, by supporting a more expressive set of programs, we have -introduced several new execution challenges. - -Relay's interpreter can execute the full language but has notable limitations -that make it unsuited for production deployments. It is structured as an inefficient -interpreter that performs AST traversal to execute the program. This approach is conceptually -simple but inefficient, as the AST traversal heavily relies on indirection. - -There are further challenges in compiling dynamic code, such as dynamic scheduling and allocation, -fully dynamic tensor shapes, and control flow. The interpreter offers simple solutions -for these, but none is sufficiently compelling or optimized. - -The second execution mechanism is the existing graph executor. In order to target Relay -programs to this, we compile a small subset of them to the old graph format and execute -them on the runtime. Graph executor provides a fast execution experience but only for a very limited -subset of Relay programs. - -An alternative but not-standard approach is Relay's ahead-of-time compiler, -which compiles a Relay program into a shared library containing an ahead-of-time -implementation. The ahead-of-time compiler provides compelling performance -but is difficult to extend and instrument, which can only be done by modifying the -code generation and optimization mechanisms. - -The Relay virtual machine is intended to be a framework that balances these competing -approaches, providing a dynamic execution environment which can be extended, instrumented, -and integrated with other approaches like ahead-of-time compilation via a flexible extension -mechanism. - -The virtual machine is designed to strike a balance between performance and flexibility -when deploying and executing Relay programs, without giving up the benefits of TVM. - -Virtual machine (VM) design is a well-studied area in programming languages and systems, -and there have been various virtual machine designs for both full-fledged -and embedded programing languages. -Previous language VM designs have been heavily tailored to the execution profile of traditional programs. -Traditional programs manipulate small scalar values and consist of a large number of low-level instructions. -The sheer quantity of instructions requires instruction execution and dispatch to be extremely efficient. -In the context of machine learning we manipulate primarily tensor values, using a (relatively) -low number of high level instructions. ML programs' cost centers are expensive operator invocations, -such as GEMM or convolution, over a large input. 
Due to the execution profile exhibited by ML programs, -micro-optimizations present in scalar VMs are dramatically less important. - -TVM has provided strong support for vision models, -but we want to grow to support a wider variety of models. -The graph executor is able to utilize the fully static nature of the input graphs to perform -aggressive optimization such as fully static allocation, and optimal memory reuse. -When we introduce models which make use of control flow, recursion, dynamic shapes, and dynamic -allocation, we must change how execution works. A virtual machine for Relay is a natural choice. - -The rest of this document provides a high-level overview of the Relay -virtual machine design and its instruction set. - -Design ------- - -The VM's design is focused on simplicity without sacrificing performance. -In order to accomplish this we have focused on designing a tensor VM rather than a scalar VM. - -In the tensor VM setting, we optimize for cheap “allocation” of objects (by trying to avoid real allocation), -reuse of static fragments, and the ability to do dynamic shape (i.e jagged tensors). - -Instruction Set -~~~~~~~~~~~~~~~ - -The choices of an instruction set and instruction representation are the most critical design decisions for a VM. -The current representation of the instructions is a tagged union containing the op-code and the data payload. An important design decision is the level of abstraction of the instructions (RISC vs. CISC) and how they take their data (fixed-width instruction encoding vs. variable-length encoding). The current version is closer to CISC, with complex instructions like AllocTensor, and is variable-length due to the inclusion of the shape as part of the instruction. The current instruction set is very high-level and corresponds roughly to high-level operations in Relay. - -Ret -^^^ -**Arguments**: -:: - - RegName dst - RegName result - -Returns the object in register ``result`` to caller's register ``dst``. - -InvokePacked -^^^^^^^^^^^^ -**Arguments**: -:: - - Index packed_index - Index arity - Index output_size - RegName* packed_args - -Invoke the packed function denoted by ``packed_index``. The ``arity`` -and ``output_size`` are used to inform the VM how many inputs and -outputs to expect. ``packed_args`` stores the list of argument registers. Note ``Index`` -is an alias of ``int64_t``, and it will be used in other instructions as well. - -AllocTensor -^^^^^^^^^^^ -**Arguments**: -:: - - RegName dst - RegName storage - uint32_t ndim - int64_t* shape - DLDataType dtype - -Allocate a tensor value of using constant shape (stored in ``shape``) and ``dtype`` -from the given storage block, ``storage``. The result is saved to register ``dst``. - -AllocTensorReg -^^^^^^^^^^^^^^ -**Arguments**: -:: - - RegName dst - RegName storage - RegName shape_register - DLDataType dtype - -Allocate a tensor value of the appropriate shape (stored in ``shape_register``) -and ``dtype`` from the given storage block (stored in ``storage``). The result is saved to register ``dst``. - -AllocStorage -^^^^^^^^^^^^ -**Arguments**: -:: - - RegName dst - RegName size - RegName alignment - DLDataType dtype_hint - -Allocate a storage block with the given ``size``, ``alignment`` and data type, ``dtype_hint``. -The allocated storage block is stored in register ``dst``. 
- -AllocADT -^^^^^^^^ -**Arguments**: -:: - - RegName dst - Index tag - Index num_fields - RegName* datatype_fields - -Allocate a data type with the tag ``tag`` using the ``num_fields`` entries -from registers ``datatype_fields``. The result is saved to register ``dst``. - -AllocClosure -^^^^^^^^^^^^ -**Arguments**: -:: - - RegName dst - Index clo_index - Index num_freevar - RegName* free_vars; - -Allocate a closure with the VMFunction at ``clo_index`` as -its code, and the ``num_freevar`` entries from registers in -``free_vars``. The result is saved to register ``dst``. - -GetField -^^^^^^^^ -**Arguments**: -:: - - RegName dst - RegName object - Index field_index - -Get the field value with index ``field_index`` from ``object``. And saves the result to register ``dst``. - -If -^^ -**Arguments**: -:: - - RegName test - RegName target - Index true_offset - Index false_offset - -Check if the object at register ``test`` is equal to ``target``. -If equal, relative jump by ``true_offset``, else relative -jump by ``false_offset``. - -GetTag -^^^^^^ -**Arguments**: -:: - - RegName object - RegName dst - -Get the object tag for ADT object in register ``object``. And saves the reult to register ``dst``. - -Fatal -^^^^^ -Fail the virtual machine execution. - -Goto -^^^^ -**Arguments**: -:: - - Index pc_offset - -Relative unconditional jump by ``pc_offset``. - -Invoke -^^^^^^ -**Arguments**: -:: - - Index func_index - -Invoke function at ``func_index``, consumes the number of arguments contained in the VMFunction's -arity field. - -InvokeClosure -^^^^^^^^^^^^^ -**Arguments**: -:: - - RegName closure - Index num_closure_args - RegName* closure_args - -Invokes ``closure``, consuming the number of arguments declared in the closure's VMFunction. - -LoadConst -^^^^^^^^^ -**Arguments**: -:: - - RegName dst - Index const_index - -Load the constant at ``const_index`` from the constant pool. The result is saved to register ``dst``. - -LoadConsti -^^^^^^^^^^ -**Arguments**: -:: - - Index val - RegName dst - -Load the constant integer ``val`` to register ``dst``. The result is a 0-rank tensor. - -Object Representation -~~~~~~~~~~~~~~~~~~~~~ -We leverage the object protocol to represent the objects that are used by the -VM. - -Currently, three types of objects, ``NDArray``, ``ADT``, and ``Closure`` objects, are used -to represent tensor, tuple/list, and closure data, respectively. More details -for each of them can be found at `include/tvm/runtime/ndarray.h`_, -`include/tvm/runtime/vm/vm.h`_, and `include/tvm/runtime/container.h`_, respectively. - -.. _include/tvm/runtime/ndarray.h: https://github.com/apache/tvm/blob/main/include/tvm/runtime/ndarray.h - -.. _include/tvm/runtime/vm/vm.h: https://github.com/apache/tvm/blob/main/include/tvm/runtime/vm/vm.h - -.. _include/tvm/runtime/container.h: https://github.com/apache/tvm/blob/main/include/tvm/runtime/container.h - -Stack and State -~~~~~~~~~~~~~~~ - -The Relay VM maintains a stack frame, which contains information about how to resume the -previous call. Registers are allocated in a continuous space (virtual register file) for each function. - -We keep track of a set of Relay functions we have called, a pointer into its bytecode, an offset into the byte code (known as the program counter). - -.. code-block:: c - - struct VirtualMachine { - ... - std::vector frames; - ... - // Current function. - size_t func_index; - // Pointer into the current function's instructions. 
- const Instruction* code; - // Current program counter relative to the code pointer. - size_t pc; - ... - }; - - -Dispatch Loop -~~~~~~~~~~~~~ -A critical piece of a VM is the dispatch loop. The dispatch loop usually dominates the execution time of a -virtual machine, but we have experimentally found this not to be the case for Relay. We have just implemented -a simple ``switch``/``goto`` dispatch loop which dispatches based on instruction op code. - -This loop is implemented by ``VirtualMachine::Run()``. - -VM Compiler -~~~~~~~~~~~ - -An important part of this infrastructure is a compiler from Relay's full IR into a sequence of bytecode. -The VM compiler transforms a ``tvm::relay::Module`` into a ``tvm::relay::vm::Executable``. The executable -contains a set of compiled functions, the compiled functions are contained in ``tvm::relay::vm::Function``. -The functions contain metadata about the function as well as its compiled bytecode. The emitted executable -object then can be loaded and run by a ``tvm::relay::vm::VirtualMachine`` object. For full definitions of the -data structures, please see `include/tvm/runtime/vm/executable.h`_ and `include/tvm/runtime/vm/vm.h`_. - -.. _include/tvm/runtime/vm/executable.h: https://github.com/apache/tvm/blob/main/include/tvm/runtime/vm/executable.h - -Optimizations -~~~~~~~~~~~~~ - -There are quite a few optimizations required by the VM compiler. Each of them -is implemented as a pass which is managed by the Relay pass manager. - -Optimizations marked with `TODO` are not implemented yet. - -- A-Normal Form -- Lambda Lift (see `src/relay/vm/lambda_lift.cc`_) -- Inline Primitives (see `src/relay/vm/inline_primitives.cc`_) -- Constant Pool Layout (see `src/relay/backend/vm/compiler.cc`_) -- Tail Call Optimization (TODO) -- Liveness Analysis (TODO) - -.. _src/relay/vm/lambda_lift.cc: https://github.com/apache/tvm/blob/main/src/relay/backend/vm/lambda_lift.cc - -.. _src/relay/vm/inline_primitives.cc: https://github.com/apache/tvm/blob/main/src/relay/backend/vm/inline_primitives.cc - -.. _src/relay/backend/vm/compiler.cc: https://github.com/apache/tvm/blob/main/src/relay/backend/vm/compiler.cc - -Serialization -~~~~~~~~~~~~~ - -Serializing and deserializing the executable generated by the Relay VM compiler is a must as -we may want to save the model to the disk and perform inference later. Previously, Relay has produced -a serialized form in a json file for the graph executor. However, the same format is not directly -applicable to the VM as it emits bytecode instead of graph-style programs. -Serialization of an executable essentially needs to handle both model specific -(i.e. weights and kernels) and VM related (i.e. bytecode and global function names) data. - -For kernels, we can conveniently leverage existing TVM infra to save and load -the compiled library module. Here we only focus on serializing other several -components in a binary format that is organized with the following sections in order. - -- Global section. This section contains the globals (function names) used by the virtual machine. - -- Constant section. This section is used to store the constant pool (i.e. weights of the model) - for a virtual machine. - -- Primitive name section. This section is introduced to accommodate the list of primitive - operator names that will be invoked by the virtual machine, i.e. the names - starting with ``fused_``. The primitive names are used as symbols to look up - function pointers in the compiled kernel library. 
- -- Code section. The VM functions, including bytecode, are sitting in this section. The dispatching - loop iterates through this section to fetch instructions for execution. - -Hence, unlike the graph executor artifact that contains weight (.params), graph json (.json), -and compiled kernel library (.so), the serialized executable artifact is composed of the Relay -object file (.ro) and the compiled kernel library (.so). - -A ``save`` function is implemented to store the executable to the disk and -serialize it into the above format. Meanwhile, a ``load_exec`` function is used to -load the serialized kernel binary and executable related binary code, which will be again used to -instantiate a VM object. Please refer to the `test_vm_serialization.py`_ file for more -examples. - -.. _test_vm_serialization.py: https://github.com/apache/tvm/blob/main/tests/python/relay/test_vm_serialization.py - -Unresolved Questions -~~~~~~~~~~~~~~~~~~~~ - -How do we handle dynamic shapes? -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Dynamic shape support is ongoing work in TVM as we upgrade Relay, TVM's compiler. For the most recent updates on -dynamic shape support, we recommend following updates in TVM's Discuss forum (https://discuss.tvm.apache.org/). - -How can we modify the VM to support JIT compilation of certain code paths? -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -In the code generation space there are still many tradeoffs to be analyzed and the VM is designed -to be very flexible so we can modify it for future experiments. - -How do we support heterogenous execution? -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Heterogenous execution should work out of the box assuming we have annotated the appropriate device copies. -In order to do this properly we need to run the device annotation and copying passes. diff --git a/docs/deep_dive/relax/index.rst b/docs/deep_dive/relax/index.rst index f891eb2793ec..2b7c4ea599ae 100644 --- a/docs/deep_dive/relax/index.rst +++ b/docs/deep_dive/relax/index.rst @@ -15,7 +15,7 @@ specific language governing permissions and limitations under the License. -.. _relax: +.. _relax-deep-dive: Relax ===== diff --git a/docs/deep_dive/tensor_ir/index.rst b/docs/deep_dive/tensor_ir/index.rst index 46bed7c42319..66e153ec01a5 100644 --- a/docs/deep_dive/tensor_ir/index.rst +++ b/docs/deep_dive/tensor_ir/index.rst @@ -15,7 +15,7 @@ specific language governing permissions and limitations under the License. -.. _tensor-ir: +.. _tensor-ir-deep-dive: TensorIR ======== diff --git a/docs/dev/tutorial/codebase_walkthrough.rst b/docs/dev/tutorial/codebase_walkthrough.rst index 726e253057d0..a349b69f7b58 100644 --- a/docs/dev/tutorial/codebase_walkthrough.rst +++ b/docs/dev/tutorial/codebase_walkthrough.rst @@ -124,7 +124,7 @@ Lowering is done by ``tvm.lower()`` function, defined in ``python/tvm/build_modu stmt = schedule.ScheduleOps(sch, bounds) ... -Bound inference is the process where all loop bounds and sizes of intermediate buffers are inferred. If you target the CUDA backend and you use shared memory, its required minimum size is automatically determined here. Bound inference is implemented in ``src/te/schedule/bound.cc``, ``src/te/schedule/graph.cc`` and ``src/te/schedule/message_passing.cc``. For more information on how bound inference works, see :ref:`dev-InferBound-Pass`. +Bound inference is the process where all loop bounds and sizes of intermediate buffers are inferred. 
If you target the CUDA backend and you use shared memory, its required minimum size is automatically determined here. Bound inference is implemented in ``src/te/schedule/bound.cc``, ``src/te/schedule/graph.cc`` and ``src/te/schedule/message_passing.cc``. ``stmt``, which is the output of ``ScheduleOps()``, represents an initial loop nest structure. If you have applied ``reorder`` or ``split`` primitives to your schedule, then the initial loop nest already reflects those changes. ``ScheduleOps()`` is defined in ``src/te/schedule/schedule_ops.cc``. diff --git a/docs/index.rst b/docs/index.rst index 2102bdd33a00..3abc39e82fd1 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -54,6 +54,7 @@ driving its costs down. :maxdepth: 2 :caption: Deep Dive + arch/index deep_dive/tensor_ir/index deep_dive/relax/index @@ -73,7 +74,6 @@ driving its costs down. dev/tutorial/index dev/how_to/how_to.rst reference/langref/index - arch/index topic/microtvm/index topic/vta/index From 482d2f6c1bdf920b703980329e6ebaf52283e3d8 Mon Sep 17 00:00:00 2001 From: Siyuan Feng Date: Mon, 23 Sep 2024 09:37:23 +0800 Subject: [PATCH 2/3] lint --- docs/reference/langref/relay_expr.rst | 4 +--- docs/topic/microtvm/index.rst | 7 ------- gallery/how_to/tune_with_autoscheduler/tune_network_arm.py | 1 - .../how_to/tune_with_autoscheduler/tune_network_cuda.py | 1 - .../how_to/tune_with_autoscheduler/tune_network_mali.py | 1 - gallery/how_to/tune_with_autoscheduler/tune_network_x86.py | 1 - 6 files changed, 1 insertion(+), 14 deletions(-) diff --git a/docs/reference/langref/relay_expr.rst b/docs/reference/langref/relay_expr.rst index c50acc2949dd..c789331efe63 100644 --- a/docs/reference/langref/relay_expr.rst +++ b/docs/reference/langref/relay_expr.rst @@ -540,9 +540,7 @@ the graph node will only be evaluated once by the compiled program. These bindings allow for a style of programming that corresponds to that already employed by NNVM and other dataflow graph-based input formats. The fact that the variables are not scoped offers some flexibility in evaluation order compared to :code:`let` -bindings, though this can also introduce some ambiguity in programs (the -:ref:`developer introduction to the Relay IR` includes more detailed discussion -of this nuance). +bindings, though this can also introduce some ambiguity in programs. *Note: Graph bindings are not currently parsed by the text format.* diff --git a/docs/topic/microtvm/index.rst b/docs/topic/microtvm/index.rst index 4dd4ab5d511d..2bac70241d3b 100644 --- a/docs/topic/microtvm/index.rst +++ b/docs/topic/microtvm/index.rst @@ -58,13 +58,6 @@ more as they follow through them. Here is a list of tutorials that you can start 3. Try running a more complex tutorial: :ref:`Creating Your MLPerfTiny Submission with microTVM `. -How microTVM Works -~~~~~~~~~~~~~~~~~~ - - -You can read more about the design of these pieces at the :ref:`microTVM Design Document `. - - Help and Discussion ~~~~~~~~~~~~~~~~~~~ diff --git a/gallery/how_to/tune_with_autoscheduler/tune_network_arm.py b/gallery/how_to/tune_with_autoscheduler/tune_network_arm.py index d795c3aba245..e4edf0333508 100644 --- a/gallery/how_to/tune_with_autoscheduler/tune_network_arm.py +++ b/gallery/how_to/tune_with_autoscheduler/tune_network_arm.py @@ -70,7 +70,6 @@ # with any layout, we found the best performance is typically achieved with NHWC layout. # We also implemented more optimizations for NHWC layout with the auto-scheduler. # So it is recommended to convert your models to NHWC layout to use the auto-scheduler. 
-# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM.
 
 
 def get_network(name, batch_size, layout="NHWC", dtype="float32", use_sparse=False):
diff --git a/gallery/how_to/tune_with_autoscheduler/tune_network_cuda.py b/gallery/how_to/tune_with_autoscheduler/tune_network_cuda.py
index 1f8c0cc13a35..f11aef253f81 100644
--- a/gallery/how_to/tune_with_autoscheduler/tune_network_cuda.py
+++ b/gallery/how_to/tune_with_autoscheduler/tune_network_cuda.py
@@ -64,7 +64,6 @@
 # with any layout, we found the best performance is typically achieved with NHWC layout.
 # We also implemented more optimizations for NHWC layout with the auto-scheduler.
 # So it is recommended to convert your models to NHWC layout to use the auto-scheduler.
-# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM.
 
 
 def get_network(name, batch_size, layout="NHWC", dtype="float32"):
diff --git a/gallery/how_to/tune_with_autoscheduler/tune_network_mali.py b/gallery/how_to/tune_with_autoscheduler/tune_network_mali.py
index 15f337901360..3120c30cef1a 100644
--- a/gallery/how_to/tune_with_autoscheduler/tune_network_mali.py
+++ b/gallery/how_to/tune_with_autoscheduler/tune_network_mali.py
@@ -67,7 +67,6 @@
 # with any layout, we found the best performance is typically achieved with NHWC layout.
 # We also implemented more optimizations for NHWC layout with the auto-scheduler.
 # So it is recommended to convert your models to NHWC layout to use the auto-scheduler.
-# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM.
 
 
 def get_network(name, batch_size, layout="NHWC", dtype="float32"):
diff --git a/gallery/how_to/tune_with_autoscheduler/tune_network_x86.py b/gallery/how_to/tune_with_autoscheduler/tune_network_x86.py
index 169567122f79..43314a4b0a2f 100644
--- a/gallery/how_to/tune_with_autoscheduler/tune_network_x86.py
+++ b/gallery/how_to/tune_with_autoscheduler/tune_network_x86.py
@@ -67,7 +67,6 @@
 # with any layout, we found the best performance is typically achieved with NHWC layout.
 # We also implemented more optimizations for NHWC layout with the auto-scheduler.
 # So it is recommended to convert your models to NHWC layout to use the auto-scheduler.
-# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM.
 
 
 def get_network(name, batch_size, layout="NHWC", dtype="float32", use_sparse=False):

From c6c3967cbc5b74af745471a7cd712a4237a29562 Mon Sep 17 00:00:00 2001
From: Siyuan Feng 
Date: Mon, 23 Sep 2024 21:27:22 +0800
Subject: [PATCH 3/3] lint

---
 gallery/how_to/work_with_microtvm/micro_tvmc.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gallery/how_to/work_with_microtvm/micro_tvmc.sh b/gallery/how_to/work_with_microtvm/micro_tvmc.sh
index dded94e55603..bf9338cf5f7f 100755
--- a/gallery/how_to/work_with_microtvm/micro_tvmc.sh
+++ b/gallery/how_to/work_with_microtvm/micro_tvmc.sh
@@ -96,7 +96,7 @@ wget https://github.com/tensorflow/tflite-micro/raw/a56087ffa2703b4d5632f024a8a4
 #
 # Model Library Format (MLF) is an output format that TVM provides for micro targets. MLF is a tarball
 # containing a file for each piece of the TVM compiler output which can be used on micro targets outside
-# TVM environment. Read more about :ref:`Model Library Format `.
+# TVM environment.
 #
 # Here, we generate an MLF file for the ``qemu_x86`` Zephyr board. You can choose the `aot` or `graph` executor type
 # to run this tutorial; however, we recommend using `aot` for microTVM targets since `aot` uses ahead of time