Skip to content

Commit cdabfa9

Browse files
vegaluisjose authored and Wei Chen committed
[VTA] [Hardware] Chisel implementation (apache#3258)
1 parent 92c10ec commit cdabfa9

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+4784
-23
lines changed

cmake/config.cmake

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -135,9 +135,6 @@ set(USE_TENSORRT OFF)
135135
# Build ANTLR parser for Relay text format
136136
set(USE_ANTLR OFF)
137137

138-
# Build TSIM for VTA
139-
set(USE_VTA_TSIM OFF)
140-
141138
# Whether use Relay debug mode
142139
set(USE_RELAY_DEBUG OFF)
143140

cmake/modules/VTA.cmake

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,7 @@ elseif(PYTHON)
2929
--use-cfg=${CMAKE_CURRENT_BINARY_DIR}/vta_config.json)
3030
endif()
3131

32-
execute_process(COMMAND ${VTA_CONFIG} --target OUTPUT_VARIABLE __vta_target)
33-
string(STRIP ${__vta_target} VTA_TARGET)
32+
execute_process(COMMAND ${VTA_CONFIG} --target OUTPUT_VARIABLE VTA_TARGET OUTPUT_STRIP_TRAILING_WHITESPACE)
3433

3534
message(STATUS "Build VTA runtime with target: " ${VTA_TARGET})
3635

@@ -44,6 +43,13 @@ elseif(PYTHON)
4443

4544
add_library(vta SHARED ${VTA_RUNTIME_SRCS})
4645

46+
# Wire TSIM support into the build when the VTA configuration query reported
# the "tsim" target.
#
# The expansion is quoted on purpose: if VTA_TARGET is empty (for example the
# config query printed nothing), an unquoted ${VTA_TARGET} would collapse to
# `if( STREQUAL "tsim")`, which is a configure-time error. Quoting also keeps
# if() from treating the value as the name of another variable (policy
# CMP0054 NEW behavior).
if("${VTA_TARGET}" STREQUAL "tsim")
  # PUBLIC: USE_TSIM is defined for the vta library and for its consumers.
  target_compile_definitions(vta PUBLIC USE_TSIM)
  include_directories("vta/include")
  # Collect the DPI module source and append it to RUNTIME_SRCS.
  file(GLOB RUNTIME_DPI_SRCS vta/src/dpi/module.cc)
  list(APPEND RUNTIME_SRCS ${RUNTIME_DPI_SRCS})
endif()
52+
4753
target_include_directories(vta PUBLIC vta/include)
4854

4955
foreach(__def ${VTA_DEFINITIONS})
@@ -61,12 +67,6 @@ elseif(PYTHON)
6167
target_link_libraries(vta ${__cma_lib})
6268
endif()
6369

64-
if(NOT USE_VTA_TSIM STREQUAL "OFF")
65-
include_directories("vta/include")
66-
file(GLOB RUNTIME_DPI_SRCS vta/src/dpi/module.cc)
67-
list(APPEND RUNTIME_SRCS ${RUNTIME_DPI_SRCS})
68-
endif()
69-
7070
else()
7171
message(STATUS "Cannot found python in env, VTA build is skipped..")
7272
endif()

vta/apps/tsim_example/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ sudo apt install verilator sbt
4949
## Setup in TVM
5050

5151
1. Install `verilator` and `sbt` as described above
52-
2. Enable VTA TSIM by turning on the switch `USE_VTA_TSIM` in config.cmake
52+
2. Set the VTA TARGET to `tsim` in `<tvm-root>/vta/config/vta_config.json`
5353
3. Build tvm
5454

5555
## How to run VTA TSIM examples

vta/apps/tsim_example/cmake/modules/hw.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ else()
124124
file(GLOB VERILATOR_SRC ${VTA_HW_DPI_DIR}/tsim_device.cc)
125125
add_library(hw SHARED ${VERILATOR_LIB_SRC} ${VERILATOR_GEN_SRC} ${VERILATOR_SRC})
126126

127-
set(VERILATOR_DEF VL_TSIM_NAME=V${TSIM_TOP_NAME} VL_PRINTF=printf VM_COVERAGE=0 VM_SC=0)
127+
set(VERILATOR_DEF VL_USER_FINISH VL_TSIM_NAME=V${TSIM_TOP_NAME} VL_PRINTF=printf VM_COVERAGE=0 VM_SC=0)
128128
if (NOT TSIM_USE_TRACE STREQUAL "OFF")
129129
list(APPEND VERILATOR_DEF VM_TRACE=1 TSIM_TRACE_FILE=${TSIM_BUILD_DIR}/${TSIM_TRACE_NAME}.vcd)
130130
else()

vta/hardware/chisel/Makefile

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,5 +15,81 @@
1515
# specific language governing permissions and limitations
1616
# under the License.
1717

18+
# Build knobs. Each can be overridden on the make command line,
# e.g. `make USE_TRACE=1`.
CONFIG = DefaultF1Config
TOP = VTA
TOP_TEST = Test
BUILD_NAME = build
# Any value other than 0 enables Verilator tracing (see the ifneq below).
USE_TRACE = 0
VTA_LIBNAME = libvta_hw

# Derived names and locations.
config_test = $(TOP_TEST)$(CONFIG)
vta_dir = $(abspath ../../)
tvm_dir = $(abspath ../../../)
# NOTE(review): hard-coded install location of Verilator's support sources;
# override verilator_inc_dir on the command line if they live elsewhere.
verilator_inc_dir = /usr/local/share/verilator/include
verilator_build_dir = $(vta_dir)/$(BUILD_NAME)/verilator
chisel_build_dir = $(vta_dir)/$(BUILD_NAME)/chisel

# Options passed to verilator when translating the generated Verilog to C++.
verilator_opt = --cc
verilator_opt += +define+RANDOMIZE_GARBAGE_ASSIGN
verilator_opt += +define+RANDOMIZE_REG_INIT
verilator_opt += +define+RANDOMIZE_MEM_INIT
verilator_opt += --x-assign unique
verilator_opt += --output-split 20000
verilator_opt += --output-split-cfuncs 20000
verilator_opt += --top-module $(TOP_TEST)
verilator_opt += -Mdir $(verilator_build_dir)
verilator_opt += -I$(chisel_build_dir)

# Compiler flags used to build the shared library.
cxx_flags = -O2 -Wall -fPIC -shared
cxx_flags += -fvisibility=hidden -std=c++11
cxx_flags += -DVL_TSIM_NAME=V$(TOP_TEST)
cxx_flags += -DVL_PRINTF=printf
cxx_flags += -DVL_USER_FINISH
cxx_flags += -DVM_COVERAGE=0
cxx_flags += -DVM_SC=0
cxx_flags += -Wno-sign-compare
cxx_flags += -include V$(TOP_TEST).h
cxx_flags += -I$(verilator_build_dir)
cxx_flags += -I$(verilator_inc_dir)
cxx_flags += -I$(verilator_inc_dir)/vltstd
cxx_flags += -I$(vta_dir)/include
cxx_flags += -I$(tvm_dir)/include
cxx_flags += -I$(tvm_dir)/3rdparty/dlpack/include

# Sources compiled into the library. cxx_files is a recursively expanded
# variable, so the wildcard is evaluated when the recipe that uses it runs.
cxx_files = $(verilator_inc_dir)/verilated.cpp
cxx_files += $(verilator_inc_dir)/verilated_dpi.cpp
cxx_files += $(wildcard $(verilator_build_dir)/*.cpp)
cxx_files += $(vta_dir)/hardware/dpi/tsim_device.cc

# Tracing: adds the verilator flag, the VCD writer source and the trace defines.
ifneq ($(USE_TRACE), 0)
  verilator_opt += --trace
  cxx_flags += -DVM_TRACE=1
  cxx_flags += -DTSIM_TRACE_FILE=$(verilator_build_dir)/$(TOP_TEST).vcd
  cxx_files += $(verilator_inc_dir)/verilated_vcd_c.cpp
else
  cxx_flags += -DVM_TRACE=0
endif

# These names are commands, not files produced by a recipe. Declaring them
# phony keeps a same-named file or directory from disabling the rule.
.PHONY: default lib verilator verilog verilog_test clean cleanall

default: lib

# Shared library built from the Verilator output plus the DPI device source.
# $(CXX) defaults to g++ in GNU make, so the default behavior is unchanged
# while still allowing the compiler to be overridden.
lib: $(vta_dir)/$(BUILD_NAME)/$(VTA_LIBNAME).so
$(vta_dir)/$(BUILD_NAME)/$(VTA_LIBNAME).so: $(verilator_build_dir)/V$(TOP_TEST).cpp
	$(CXX) $(cxx_flags) $(cxx_files) -o $@

# C++ model generated by verilator from the test-harness Verilog.
verilator: $(verilator_build_dir)/V$(TOP_TEST).cpp
$(verilator_build_dir)/V$(TOP_TEST).cpp: $(chisel_build_dir)/$(TOP_TEST).$(CONFIG).v
	verilator $(verilator_opt) $<

# Verilog for the $(TOP) design, generated by running vta.$(CONFIG) under sbt.
verilog: $(chisel_build_dir)/$(TOP).$(CONFIG).v
$(chisel_build_dir)/$(TOP).$(CONFIG).v:
	sbt 'runMain vta.$(CONFIG) --target-dir $(chisel_build_dir) --top-name $(TOP).$(CONFIG)'

# Verilog for the $(TOP_TEST) harness, generated by running vta.$(config_test) under sbt.
verilog_test: $(chisel_build_dir)/$(TOP_TEST).$(CONFIG).v
$(chisel_build_dir)/$(TOP_TEST).$(CONFIG).v:
	sbt 'runMain vta.$(config_test) --target-dir $(chisel_build_dir) --top-name $(TOP_TEST).$(CONFIG)'

# Remove sbt build products in this directory; the leading '-' ignores failures.
clean:
	-rm -rf target project/target project/project

# Remove the whole $(BUILD_NAME) output tree under the VTA directory.
cleanall:
	-rm -rf $(vta_dir)/$(BUILD_NAME)

vta/hardware/chisel/src/main/resources/verilog/VTAHostDPI.v

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ module VTAHostDPI #
112112

113113
always_ff @(posedge clock) begin
114114
if (__exit == 'd1) begin
115-
$display("[DONE] at cycle:%016d", cycles);
115+
$display("[TSIM] Verilog $finish called at cycle:%016d", cycles);
116116
$finish;
117117
end
118118
end
Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package vta.core
21+
22+
import chisel3._
23+
import chisel3.util._
24+
import vta.util.config._
25+
import vta.shell._
26+
27+
/** Compute.
 *
 * Buffers the instructions arriving on `io.inst`, decodes the one at the head
 * of the queue and hands it to one of the units instantiated here:
 * - loadUop: micro-op loads, reading memory through `io.vme_rd(0)`
 * - tensorAcc: accumulator (bias) loads, reading memory through `io.vme_rd(1)`
 * - tensorGemm: GEMM instructions
 * - tensorAlu: ALU instructions
 *
 * A three-state machine sequences execution:
 * - sIdle: waits until an instruction is valid and every semaphore it pops
 *   is ready, then moves to sSync (sync instruction) or sExe (any other
 *   recognized instruction).
 * - sSync: lasts one cycle and returns to sIdle.
 * - sExe: returns to sIdle once the selected unit reports done.
 *
 * The instruction is dequeued, and the `o_post` pulses are produced, in the
 * cycle the instruction retires (sSync, or sExe with done).
 *
 * @param debug when true, printf statements tracing the start and the
 *              completion of each instruction are elaborated
 */
class Compute(debug: Boolean = false)(implicit p: Parameters) extends Module {
  val mp = p(ShellKey).memParams
  val io = IO(new Bundle {
    val i_post = Vec(2, Input(Bool()))
    val o_post = Vec(2, Output(Bool()))
    val inst = Flipped(Decoupled(UInt(INST_BITS.W)))
    val uop_baddr = Input(UInt(mp.addrBits.W))
    val acc_baddr = Input(UInt(mp.addrBits.W))
    val vme_rd = Vec(2, new VMEReadMaster)
    val inp = new TensorMaster(tensorType = "inp")
    val wgt = new TensorMaster(tensorType = "wgt")
    val out = new TensorMaster(tensorType = "out")
    val finish = Output(Bool())
  })

  val sIdle :: sSync :: sExe :: Nil = Enum(3)
  val state = RegInit(sIdle)

  // s(0) guards the "prev" dependency, s(1) the "next" dependency.
  val s = Seq.fill(2) {
    Module(new Semaphore(counterBits = 8, counterInitValue = 0))
  }

  val loadUop = Module(new LoadUop)
  val tensorAcc = Module(new TensorLoad(tensorType = "acc"))
  val tensorGemm = Module(new TensorGemm)
  val tensorAlu = Module(new TensorAlu)

  val inst_q = Module(new Queue(UInt(INST_BITS.W), p(CoreKey).instQueueEntries))

  // Decode the instruction at the head of the queue.
  val dec = Module(new ComputeDecode)
  dec.io.inst := inst_q.io.deq.bits

  // Instruction class as a bit vector; bit 0 is isLoadUop, bit 4 is isFinish.
  val inst_type =
    Cat(
      dec.io.isFinish,
      dec.io.isAlu,
      dec.io.isGemm,
      dec.io.isLoadAcc,
      dec.io.isLoadUop
    ).asUInt

  // A semaphore only gates the instruction when the instruction pops it.
  val sprev = inst_q.io.deq.valid & Mux(dec.io.pop_prev, s(0).io.sready, true.B)
  val snext = inst_q.io.deq.valid & Mux(dec.io.pop_next, s(1).io.sready, true.B)
  val start = snext & sprev

  // Completion signal of whichever unit the current instruction selects.
  val done = MuxLookup(
    inst_type,
    false.B, // default
    Seq(
      "h_01".U -> loadUop.io.done,
      "h_02".U -> tensorAcc.io.done,
      "h_04".U -> tensorGemm.io.done,
      "h_08".U -> tensorAlu.io.done,
      "h_10".U -> true.B // Finish
    )
  )

  // Cycle in which an instruction is launched from the idle state.
  private val issue = (state === sIdle) & start
  // Cycle in which the current instruction completes and leaves the queue.
  private val retire = ((state === sExe) & done) | (state === sSync)

  // control
  switch (state) {
    is (sIdle) {
      when (start & dec.io.isSync) {
        state := sSync
      } .elsewhen (start & inst_type.orR) {
        state := sExe
      }
    }
    is (sSync) {
      state := sIdle
    }
    is (sExe) {
      when (done) {
        state := sIdle
      }
    }
  }

  // instructions
  inst_q.io.enq <> io.inst
  inst_q.io.deq.ready := retire

  // uop
  loadUop.io.start := issue & dec.io.isLoadUop
  loadUop.io.inst := inst_q.io.deq.bits
  loadUop.io.baddr := io.uop_baddr
  io.vme_rd(0) <> loadUop.io.vme_rd
  loadUop.io.uop.idx <> Mux(dec.io.isGemm, tensorGemm.io.uop.idx, tensorAlu.io.uop.idx)

  // acc
  tensorAcc.io.start := issue & dec.io.isLoadAcc
  tensorAcc.io.inst := inst_q.io.deq.bits
  tensorAcc.io.baddr := io.acc_baddr
  tensorAcc.io.tensor.rd.idx <> Mux(dec.io.isGemm, tensorGemm.io.acc.rd.idx, tensorAlu.io.acc.rd.idx)
  tensorAcc.io.tensor.wr <> Mux(dec.io.isGemm, tensorGemm.io.acc.wr, tensorAlu.io.acc.wr)
  io.vme_rd(1) <> tensorAcc.io.vme_rd

  // gemm: read-data valids are qualified so only the selected unit sees them
  tensorGemm.io.start := issue & dec.io.isGemm
  tensorGemm.io.inst := inst_q.io.deq.bits
  tensorGemm.io.uop.data.valid := loadUop.io.uop.data.valid & dec.io.isGemm
  tensorGemm.io.uop.data.bits <> loadUop.io.uop.data.bits
  tensorGemm.io.inp <> io.inp
  tensorGemm.io.wgt <> io.wgt
  tensorGemm.io.acc.rd.data.valid := tensorAcc.io.tensor.rd.data.valid & dec.io.isGemm
  tensorGemm.io.acc.rd.data.bits <> tensorAcc.io.tensor.rd.data.bits
  tensorGemm.io.out.rd.data.valid := io.out.rd.data.valid & dec.io.isGemm
  tensorGemm.io.out.rd.data.bits <> io.out.rd.data.bits

  // alu
  tensorAlu.io.start := issue & dec.io.isAlu
  tensorAlu.io.inst := inst_q.io.deq.bits
  tensorAlu.io.uop.data.valid := loadUop.io.uop.data.valid & dec.io.isAlu
  tensorAlu.io.uop.data.bits <> loadUop.io.uop.data.bits
  tensorAlu.io.acc.rd.data.valid := tensorAcc.io.tensor.rd.data.valid & dec.io.isAlu
  tensorAlu.io.acc.rd.data.bits <> tensorAcc.io.tensor.rd.data.bits
  tensorAlu.io.out.rd.data.valid := io.out.rd.data.valid & dec.io.isAlu
  tensorAlu.io.out.rd.data.bits <> io.out.rd.data.bits

  // out
  io.out.rd.idx <> Mux(dec.io.isGemm, tensorGemm.io.out.rd.idx, tensorAlu.io.out.rd.idx)
  io.out.wr <> Mux(dec.io.isGemm, tensorGemm.io.out.wr, tensorAlu.io.out.wr)

  // semaphore: wait when the instruction issues, post when it retires
  s(0).io.spost := io.i_post(0)
  s(1).io.spost := io.i_post(1)
  s(0).io.swait := dec.io.pop_prev & issue
  s(1).io.swait := dec.io.pop_next & issue
  io.o_post(0) := dec.io.push_prev & retire
  io.o_post(1) := dec.io.push_next & retire

  // finish
  io.finish := (state === sExe) & done & dec.io.isFinish

  // debug
  if (debug) {
    // start
    when (issue) {
      when (dec.io.isSync) {
        printf("[Compute] start sync\n")
      } .elsewhen (dec.io.isLoadUop) {
        printf("[Compute] start load uop\n")
      } .elsewhen (dec.io.isLoadAcc) {
        printf("[Compute] start load acc\n")
      } .elsewhen (dec.io.isGemm) {
        printf("[Compute] start gemm\n")
      } .elsewhen (dec.io.isAlu) {
        printf("[Compute] start alu\n")
      } .elsewhen (dec.io.isFinish) {
        printf("[Compute] start finish\n")
      }
    }
    // done
    when (state === sSync) {
      printf("[Compute] done sync\n")
    }
    when ((state === sExe) && done) {
      when (dec.io.isLoadUop) {
        printf("[Compute] done load uop\n")
      } .elsewhen (dec.io.isLoadAcc) {
        printf("[Compute] done load acc\n")
      } .elsewhen (dec.io.isGemm) {
        printf("[Compute] done gemm\n")
      } .elsewhen (dec.io.isAlu) {
        printf("[Compute] done alu\n")
      } .elsewhen (dec.io.isFinish) {
        printf("[Compute] done finish\n")
      }
    }
  }
}

0 commit comments

Comments
 (0)