Skip to content

Commit fe8910a

Browse files
committed
[Hardware] Ultra-96 support (apache#16)
* vta ALU fix * allowing for more matrix shapes for batched inference at lower precisions, for narrower input channel vectors * refactoring topi test so same tests can be used by autovta * makefile changes: use simply expanded variables * correctness checks are returned along with other stats * moving to version 0.0.4 * fixing bias shape bug when relying on tensorization * updating drivers * being more specific * preliminary ultra-96 support * updating drivers to prevent memory leaks, ultra96 support tested in non-coherent mode * adding support for dynamic runtime rebuilding on Ultra-96 * hls support for Ultra-96 * bug fix in old scheduler * taking advantage of Ultra96 device width * updated ultra96 drivers * hardware compilation for ultra96 (WIP) * elif fix * adding target to path * updated freq for ultra96 * not needed param * simulator bug fixes * ARM CPU operator support for Ultra-96 (aarch64 - Cortexa53) * fail elegantly when trying to program FPGA in sim mode * wip - support for ultra96 * fix llvm cmd * rpc server bitstream program cleanup * fix ultra96 driver address map * updating default ultra96 conf, and bitstream format
1 parent d370a98 commit fe8910a

32 files changed

+3196
-477
lines changed

apps/pynq_rpc/start_rpc_server.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,5 +2,4 @@
22
PROJROOT="$( cd "$( dirname "${BASH_SOURCE[0]}" )/../../" && pwd )"
33

44
export PYTHONPATH=${PYTHONPATH}:${PROJROOT}/python:${PROJROOT}/vta/python
5-
export PYTHONPATH=${PYTHONPATH}:/home/xilinx/pynq
6-
python3 -m vta.exec.rpc_server
5+
python3.6 -m vta.exec.rpc_server

cmake/modules/VTA.cmake

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,11 @@ elseif(PYTHON)
4343
find_library(__cma_lib NAMES cma PATH /usr/lib)
4444
target_link_libraries(vta ${__cma_lib})
4545
endif()
46+
# Ultra96 rules
47+
if(${VTA_TARGET} STREQUAL "ultra96")
48+
find_library(__sds_lib NAMES sds_lib PATH /usr/lib)
49+
target_link_libraries(vta ${__sds_lib})
50+
endif()
4651
else()
4752
message(STATUS "Cannot found python in env, VTA build is skipped..")
4853
endif()

vta/config/pynq_sample.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
{
22
"TARGET" : "pynq",
3-
"HW_VER" : "0.0.2",
3+
"HW_VER" : "0.0.4",
44
"HW_FREQ" : 100,
55
"HW_CLK_TARGET" : 7,
66
"ALU_EN" : true,
7-
"MUL_EN" : true,
7+
"MUL_EN" : false,
88
"GEMM_II" : 1,
99
"TALU_II" : 2,
1010
"LOG_INP_WIDTH" : 3,

vta/config/ultra96_sample.json

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
{
2+
"TARGET" : "ultra96",
3+
"HW_VER" : "0.0.4",
4+
"HW_FREQ" : 333,
5+
"HW_CLK_TARGET" : 2,
6+
"ALU_EN" : true,
7+
"MUL_EN" : false,
8+
"GEMM_II" : 1,
9+
"TALU_II" : 2,
10+
"LOG_INP_WIDTH" : 3,
11+
"LOG_WGT_WIDTH" : 3,
12+
"LOG_ACC_WIDTH" : 5,
13+
"LOG_OUT_WIDTH" : 3,
14+
"LOG_BATCH" : 0,
15+
"LOG_BLOCK_IN" : 4,
16+
"LOG_BLOCK_OUT" : 4,
17+
"LOG_UOP_BUFF_SIZE" : 15,
18+
"LOG_INP_BUFF_SIZE" : 15,
19+
"LOG_WGT_BUFF_SIZE" : 18,
20+
"LOG_ACC_BUFF_SIZE" : 17
21+
}

vta/config/vta_config.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
{
22
"TARGET" : "sim",
3-
"HW_VER" : "0.0.2",
3+
"HW_VER" : "0.0.4",
44
"HW_FREQ" : 100,
55
"HW_CLK_TARGET" : 7,
66
"ALU_EN" : true,
7-
"MUL_EN" : true,
7+
"MUL_EN" : false,
88
"GEMM_II" : 1,
99
"TALU_II" : 2,
1010
"LOG_INP_WIDTH" : 3,

vta/config/vta_config.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,8 @@ def main():
9797
cfg["LOG_OUT_BUFF_SIZE"] = cfg["LOG_ACC_BUFF_SIZE"] - cfg["LOG_ACC_WIDTH"] + cfg["LOG_OUT_WIDTH"]
9898
# Generate bitstream config string.
9999
# Needs to match the BITSTREAM string in python/vta/environment.py
100-
cfg["BITSTREAM"] = "{}_{}x{}x{}_a{}w{}o{}_{}_{}_{}_{}_{}MHz_{}ns_gii{}".format(
100+
cfg["BITSTREAM"] = "{}_{}_{}x{}x{}_a{}w{}o{}_{}_{}_{}_{}_{}MHz_{}ns_gii{}".format(
101+
cfg["TARGET"],
101102
cfg["HW_VER"].replace('.', '_'),
102103
(1 << cfg["LOG_BATCH"]),
103104
(1 << cfg["LOG_BLOCK_IN"]),
@@ -131,6 +132,8 @@ def main():
131132
cflags_str = " ".join(pkg.cflags)
132133
if cfg["TARGET"] == "pynq":
133134
cflags_str += " -DVTA_TARGET_PYNQ"
135+
if cfg["TARGET"] == "ultra96":
136+
cflags_str += " -DVTA_TARGET_ULTRA96"
134137
print(cflags_str)
135138

136139
if args.ldflags:

vta/hardware/xilinx/Makefile

Lines changed: 18 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,14 @@ VIVADO = vivado
1414
HSI = hsi
1515

1616
# HLS mode
17-
MODE = all
17+
MODE = skip_sim
1818
# Debug flag
1919
DEBUG = False
2020
# SLURM
2121
SLURM = False
2222

2323
# Process VTA JSON config
24-
VTA_CONFIG = python $(CURDIR)/../../config/vta_config.py
24+
VTA_CONFIG := python $(CURDIR)/../../config/vta_config.py
2525
CFLAGS := $(shell ${VTA_CONFIG} --cflags)
2626
VTA_TARGET := $(shell ${VTA_CONFIG} --target)
2727

@@ -46,10 +46,10 @@ VTA_MUL_EN := $(shell ${VTA_CONFIG} --get-mulen)
4646
#---------------------
4747
# FPGA Parameters
4848
#--------------------
49-
VTA_CLOCK_FREQ = $(shell ${VTA_CONFIG} --get-fpgafreq)
50-
VTA_TARGET_PER = $(shell ${VTA_CONFIG} --get-fpgaper)
51-
VTA_GEMM_II = $(shell ${VTA_CONFIG} --get-gemmii)
52-
VTA_TALU_II = $(shell ${VTA_CONFIG} --get-taluii)
49+
VTA_CLOCK_FREQ := $(shell ${VTA_CONFIG} --get-fpgafreq)
50+
VTA_TARGET_PER := $(shell ${VTA_CONFIG} --get-fpgaper)
51+
VTA_GEMM_II := $(shell ${VTA_CONFIG} --get-gemmii)
52+
VTA_TALU_II := $(shell ${VTA_CONFIG} --get-taluii)
5353

5454
#---------------------
5555
# Compilation parameters
@@ -59,20 +59,21 @@ VTA_TALU_II = $(shell ${VTA_CONFIG} --get-taluii)
5959
VTA_HW_COMP_THREADS = 8
6060

6161
# Derive config name
62-
CONF = $(shell ${VTA_CONFIG} --cfg-str)
63-
IP_BUILD_PATH = $(BUILD_DIR)/hls/$(CONF)
64-
HW_BUILD_PATH = $(BUILD_DIR)/vivado/$(CONF)
62+
CONF := $(shell ${VTA_CONFIG} --cfg-str)
63+
IP_BUILD_PATH := $(BUILD_DIR)/hls/$(CONF)
64+
HW_BUILD_PATH := $(BUILD_DIR)/vivado/$(CONF)
6565

66-
ifeq ($(SLURM), true)
66+
# Build on local scratch drive when using cluster
67+
ifeq ($(SLURM), True)
6768
IP_BUILD_PATH = /scratch/hls/$(CONF)
6869
HW_BUILD_PATH = /scratch/vivado/$(CONF)
6970
endif
7071

7172
# IP file path
72-
IP_PATH = $(BUILD_DIR)/hls/$(CONF)/solution0/impl/ip/xilinx_com_hls_vta_1_0.zip
73+
IP_PATH := $(BUILD_DIR)/hls/$(CONF)/vta_compute/solution0/impl/ip/xilinx_com_hls_compute_1_0.zip
7374

7475
# Bitstream file path
75-
BIT_PATH = $(BUILD_DIR)/vivado/$(CONF)/export/$(CONF).bit
76+
BIT_PATH := $(BUILD_DIR)/vivado/$(CONF)/export/$(CONF).bit
7677

7778
.PHONY: all ip bit bsp clean clean_all
7879

@@ -84,27 +85,28 @@ $(IP_PATH): $(SRC_DIR)/*
8485
mkdir -p $(IP_BUILD_PATH)
8586
cd $(IP_BUILD_PATH) && \
8687
$(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \
87-
-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) \
88+
-tclargs $(VTA_TARGET) \
89+
$(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) \
8890
$(MODE) $(DEBUG) $(VTA_ALU_EN) $(VTA_MUL_EN) \
8991
$(VTA_TARGET_PER) $(VTA_GEMM_II) $(VTA_TALU_II) \
9092
$(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_ACC_WIDTH) $(VTA_OUT_WIDTH) \
9193
$(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \
9294
$(VTA_UOP_BUFF_SIZE) $(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) \
9395
$(VTA_ACC_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE)
94-
ifeq ($(SLURM), true)
96+
ifeq ($(SLURM), True)
9597
mkdir -p $(BUILD_DIR)/hls
9698
mv $(IP_BUILD_PATH) $(BUILD_DIR)/hls/.
9799
endif
98100

99101
$(BIT_PATH): $(IP_PATH)
100102
mkdir -p $(HW_BUILD_PATH)
101103
cd $(HW_BUILD_PATH) && \
102-
$(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl \
104+
$(VIVADO) -mode tcl -source $(SCRIPT_DIR)/ultra96.tcl \
103105
-tclargs $(BUILD_DIR)/hls/$(CONF) $(VTA_HW_COMP_THREADS) $(VTA_CLOCK_FREQ) $(VTA_GEMM_II) \
104106
$(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_OUT_WIDTH) \
105107
$(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \
106108
$(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE)
107-
ifeq ($(SLURM), true)
109+
ifeq ($(SLURM), True)
108110
mkdir -p $(BUILD_DIR)/vivado
109111
mv $(HW_BUILD_PATH) $(BUILD_DIR)/vivado/.
110112
endif

vta/hardware/xilinx/scripts/hls.tcl

Lines changed: 63 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -5,55 +5,58 @@
55
#
66

77
# Command line arguments:
8-
# Arg 1: path to design sources
9-
# Arg 2: path to sim sources
10-
# Arg 3: path to test sources
11-
# Arg 4: path to include sources
12-
# Arg 5: mode
13-
# Arg 6: debug
14-
# Arg 7: alu_ena
15-
# Arg 8: mul_ena
16-
# Arg 9: target clock period
17-
# Arg 10: target II for GEMM
18-
# Arg 11: target II for tensor ALU
19-
# Arg 12: input type width (log)
20-
# Arg 13: weight type width (log)
21-
# Arg 14: accum type width (log)
22-
# Arg 15: output type width (log)
23-
# Arg 16: batch size (log)
24-
# Arg 17: in block size (log)
25-
# Arg 18: out block size (log)
26-
# Arg 19: uop buffer size in B (log)
27-
# Arg 20: inp buffer size in B (log)
28-
# Arg 21: wgt buffer size in B (log)
29-
# Arg 22: acc buffer size in B (log)
30-
# Arg 23: out buffer size in B (log)
31-
32-
if { [llength $argv] eq 25 } {
33-
set src_dir [lindex $argv 2]
34-
set sim_dir [lindex $argv 3]
35-
set test_dir [lindex $argv 4]
36-
set include_dir [lindex $argv 5]
37-
set mode [lindex $argv 6]
38-
set debug [lindex $argv 7]
39-
set alu_ena [lindex $argv 8]
40-
set mul_ena [lindex $argv 9]
41-
set target_period [lindex $argv 10]
42-
set target_gemm_ii [lindex $argv 11]
43-
set target_alu_ii [lindex $argv 12]
44-
set inp_width [lindex $argv 13]
45-
set wgt_width [lindex $argv 14]
46-
set acc_width [lindex $argv 15]
47-
set out_width [lindex $argv 16]
48-
set batch [lindex $argv 17]
49-
set block_in [lindex $argv 18]
50-
set block_out [lindex $argv 19]
51-
set uop_buff_size [lindex $argv 20]
52-
set inp_buff_size [lindex $argv 21]
53-
set wgt_buff_size [lindex $argv 22]
54-
set acc_buff_size [lindex $argv 23]
55-
set out_buff_size [lindex $argv 24]
8+
# Arg 1: target (FPGA)
9+
# Arg 2: path to design sources
10+
# Arg 3: path to sim sources
11+
# Arg 4: path to test sources
12+
# Arg 5: path to include sources
13+
# Arg 6: mode
14+
# Arg 7: debug
15+
# Arg 8: alu_ena
16+
# Arg 9: mul_ena
17+
# Arg 10: target clock period
18+
# Arg 11: target II for GEMM
19+
# Arg 12: target II for tensor ALU
20+
# Arg 13: input type width (log)
21+
# Arg 14: weight type width (log)
22+
# Arg 15: accum type width (log)
23+
# Arg 16: output type width (log)
24+
# Arg 17: batch size (log)
25+
# Arg 18: in block size (log)
26+
# Arg 19: out block size (log)
27+
# Arg 20: uop buffer size in B (log)
28+
# Arg 21: inp buffer size in B (log)
29+
# Arg 22: wgt buffer size in B (log)
30+
# Arg 23: acc buffer size in B (log)
31+
# Arg 24: out buffer size in B (log)
32+
33+
if { [llength $argv] eq 26 } {
34+
set target [lindex $argv 2]
35+
set src_dir [lindex $argv 3]
36+
set sim_dir [lindex $argv 4]
37+
set test_dir [lindex $argv 5]
38+
set include_dir [lindex $argv 6]
39+
set mode [lindex $argv 7]
40+
set debug [lindex $argv 8]
41+
set alu_ena [lindex $argv 9]
42+
set mul_ena [lindex $argv 10]
43+
set target_period [lindex $argv 11]
44+
set target_gemm_ii [lindex $argv 12]
45+
set target_alu_ii [lindex $argv 13]
46+
set inp_width [lindex $argv 14]
47+
set wgt_width [lindex $argv 15]
48+
set acc_width [lindex $argv 16]
49+
set out_width [lindex $argv 17]
50+
set batch [lindex $argv 18]
51+
set block_in [lindex $argv 19]
52+
set block_out [lindex $argv 20]
53+
set uop_buff_size [lindex $argv 21]
54+
set inp_buff_size [lindex $argv 22]
55+
set wgt_buff_size [lindex $argv 23]
56+
set acc_buff_size [lindex $argv 24]
57+
set out_buff_size [lindex $argv 25]
5658
} else {
59+
set target "pynq"
5760
set src_dir "../src"
5861
set sim_dir "../sim"
5962
set test_dir "../../src/test"
@@ -83,16 +86,20 @@ if { [llength $argv] eq 25 } {
8386
# Initializes the HLS design and sets HLS pragmas for memory partitioning.
8487
# This is necessary because of a Vivado restriction that doesn't allow for
8588
# buses wider than 1024 bits.
86-
proc init_design {per g_ii a_ii inp_width wgt_width out_width acc_width batch block_in block_out alu_ena} {
89+
proc init_design {target per g_ii a_ii inp_width wgt_width out_width acc_width batch block_in block_out alu_ena} {
8790

8891
# Set device number
89-
set_part {xc7z020clg484-1}
92+
if {$target=="pynq"} {
93+
set_part {xc7z020clg484-1}
94+
} elseif {$target=="ultra96"} {
95+
set_part {xczu3eg-sbva484-1-e}
96+
}
9097

9198
# Max bus width (supported by Vivado)
9299
set max_width 1024
93100

94101
# Set axi width (TODO derive from top level config)
95-
set axi_width 64
102+
set axi_width 128
96103

97104
# Set the clock frequency
98105
create_clock -period $per -name default
@@ -178,7 +185,7 @@ if {$mode=="all" || $mode=="sim"} {
178185
add_files -tb $sim_dir/vta_test.cc -cflags $cflags
179186
add_files -tb $test_dir/test_lib.cc -cflags $cflags
180187
open_solution "solution0"
181-
init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
188+
init_design $target $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
182189
csim_design -clean
183190
close_project
184191
}
@@ -189,7 +196,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="fetch"} {
189196
set_top fetch
190197
add_files $src_dir/vta.cc -cflags $cflags
191198
open_solution "solution0"
192-
init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
199+
init_design $target $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
193200
csynth_design
194201
if {$mode=="all" || $mode=="skip_sim"} {
195202
export_design -format ip_catalog
@@ -203,7 +210,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="load"} {
203210
set_top load
204211
add_files $src_dir/vta.cc -cflags $cflags
205212
open_solution "solution0"
206-
init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
213+
init_design $target $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
207214
csynth_design
208215
if {$mode=="all" || $mode=="skip_sim"} {
209216
export_design -format ip_catalog
@@ -217,7 +224,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="compute"} {
217224
set_top compute
218225
add_files $src_dir/vta.cc -cflags $cflags
219226
open_solution "solution0"
220-
init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
227+
init_design $target $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
221228
csynth_design
222229
if {$mode=="all" || $mode=="skip_sim"} {
223230
export_design -format ip_catalog
@@ -231,7 +238,7 @@ if {$mode=="all" || $mode=="skip_sim" || $mode=="store"} {
231238
set_top store
232239
add_files $src_dir/vta.cc -cflags $cflags
233240
open_solution "solution0"
234-
init_design $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
241+
init_design $target $target_period $target_gemm_ii $target_alu_ii $inp_width $wgt_width $out_width $acc_width $batch $block_in $block_out $alu_ena
235242
csynth_design
236243
if {$mode=="all" || $mode=="skip_sim"} {
237244
export_design -format ip_catalog

0 commit comments

Comments
 (0)