Skip to content
Snippets Groups Projects
Commit 47001850 authored by Thierry Moreau's avatar Thierry Moreau Committed by Tianqi Chen
Browse files

hardware compilation flow, and driver tests

parent b8d8e5b6
No related branches found
No related tags found
No related merge requests found
Showing
with 7174 additions and 1 deletion
# vta
Open Hardware/Software Stack for Vertical Deep Learning System Optimization
==============================================
[![GitHub license](http://dmlc.github.io/img/apache2.svg)](./LICENSE)
VTA is an open hardware/software co-design stack for deep learning systems systems.
It provides a customizable hardware accelerator template for deep learning inference workloads,
combined with a fully functional compiler stack built with TVM.
License
-------
© Contributors, 2018. Licensed under an [Apache-2.0](https://github.com/tmoreau89/vta/blob/master/LICENSE) license.
doxygen
This diff is collapsed.
build
# Directories
ROOTDIR = $(CURDIR)
BUILD_DIR = $(ROOTDIR)/build
SCRIPT_DIR = $(ROOTDIR)/scripts
SRC_DIR = $(ROOTDIR)/../../src/hardware/hls
SIM_DIR = $(ROOTDIR)/sim
TEST_DIR = $(ROOTDIR)/../../src/test
INCLUDE_DIR = $(ROOTDIR)/../../include
# Executables
VIVADO_HLS = vivado_hls
VIVADO = vivado
HSI = hsi
# Build parameters:
# Number of threads during compilation
NUM_THREADS = 8
# Target Frequency
CLOCK_FREQ = 100
# Log of input width in bits
LOG_INP_WIDTH = 3
# Log of weight width in bits
LOG_WGT_WIDTH = 3
# Log of accum width in bits
LOG_ACC_WIDTH = 5
# Log of output width in bits
LOG_OUT_WIDTH = $(LOG_INP_WIDTH)
# Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
LOG_BATCH = 0
# Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
LOG_IN_BLOCK = 4
# Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
LOG_OUT_BLOCK = 4
# Log of uop buffer size in Bytes
LOG_UOP_BUFF_SIZE = 15
# Log of inp buffer size in Bytes
LOG_INP_BUFF_SIZE = 15
# Log of wgt buffer size in Bytes
LOG_WGT_BUFF_SIZE = 15
# Log of acc buffer size in Bytes
LOG_ACC_BUFF_SIZE = 17
# Log of out buffer size in Bytes
LOG_OUT_BUFF_SIZE = $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" )
# Derived parameter
# Input width in bits
INP_WIDTH = $(shell echo "$$(( 1 << $(LOG_INP_WIDTH) ))" )
# Weight width in bits
WGT_WIDTH = $(shell echo "$$(( 1 << $(LOG_WGT_WIDTH) ))" )
# Output width in bits
OUT_WIDTH = $(shell echo "$$(( 1 << $(LOG_OUT_WIDTH) ))" )
# Tensor batch size
BATCH = $(shell echo "$$(( 1 << $(LOG_BATCH) ))" )
# Tensor outer block size
IN_BLOCK = $(shell echo "$$(( 1 << $(LOG_IN_BLOCK) ))" )
# Tensor inner block size
OUT_BLOCK = $(shell echo "$$(( 1 << $(LOG_OUT_BLOCK) ))" )
# Uop buffer size in Bytes
UOP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_UOP_BUFF_SIZE) ))" )
# Inp buffer size in Bytes
INP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_INP_BUFF_SIZE) ))" )
# Wgt buffer size in Bytes
WGT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_WGT_BUFF_SIZE) ))" )
# Acc buffer size in Bytes
ACC_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_ACC_BUFF_SIZE) ))" )
# Out buffer size in Bytes
OUT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_OUT_BUFF_SIZE) ))" )
# Derive clock target period
TARGET_PER = $(shell echo "$$(( (1000 + $(CLOCK_FREQ) - 1) / $(CLOCK_FREQ) - 0))" )
# Derive config name
CONF = \
$(BATCH)x$(IN_BLOCK)x$(OUT_BLOCK)_$(INP_WIDTH)bx$(WGT_WIDTH)b_$(CLOCK_FREQ)MHz_$(TARGET_PER)ns
IP_BUILD_PATH = $(BUILD_DIR)/hls/$(CONF)
HW_BUILD_PATH = $(BUILD_DIR)/vivado/$(CONF)
.PHONY: all ip bit driver clean
all: driver
ip:
mkdir -p $(IP_BUILD_PATH)
cd $(IP_BUILD_PATH) && \
$(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \
-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) $(TARGET_PER) \
$(LOG_INP_WIDTH) $(LOG_WGT_WIDTH) $(LOG_ACC_WIDTH) $(LOG_OUT_WIDTH) \
$(LOG_BATCH) $(LOG_OUT_BLOCK) $(LOG_IN_BLOCK) \
$(LOG_UOP_BUFF_SIZE) $(LOG_INP_BUFF_SIZE) $(LOG_WGT_BUFF_SIZE) \
$(LOG_ACC_BUFF_SIZE) $(LOG_OUT_BUFF_SIZE)
bit: ip
mkdir -p $(HW_BUILD_PATH)
cd $(HW_BUILD_PATH) && \
$(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl \
-tclargs $(IP_BUILD_PATH) $(NUM_THREADS) $(CLOCK_FREQ) \
$(INP_WIDTH) $(WGT_WIDTH) $(OUT_WIDTH) \
$(BATCH) $(IN_BLOCK) $(OUT_BLOCK) \
$(INP_BUFF_SIZE) $(WGT_BUFF_SIZE) $(OUT_BUFF_SIZE)
driver: bit
cd $(HW_BUILD_PATH) && $(HSI) -mode tcl -source $(SCRIPT_DIR)/hsi.tcl -nojournal -nolog
cd $(HW_BUILD_PATH)/bsp && make
clean:
rm -rf build
\ No newline at end of file
#
# Copyright (c) 2018 by Contributors
# file: hls.tcl
# brief: HLS generation script.
#
# Command line arguments:
# Arg 1: path to design sources
# Arg 2: path to sim sources
# Arg 3: path to test sources
# Arg 4: path to include sources
# Arg 5: target clock period
# Arg 6: input type width (log)
# Arg 7: weight type width (log)
# Arg 8: accum type width (log)
# Arg 9: output type width (log)
# Arg 10: batch size (log)
# Arg 11: in block size (log)
# Arg 12: out block size (log)
# Arg 13: uop buffer size in B (log)
# Arg 14: inp buffer size in B (log)
# Arg 15: wgt buffer size in B (log)
# Arg 16: acc buffer size in B (log)
# Arg 17: out buffer size in B (log)
if { [llength $argv] eq 19 } {
set src_dir [lindex $argv 2]
set sim_dir [lindex $argv 3]
set test_dir [lindex $argv 4]
set include_dir [lindex $argv 5]
set target_period [lindex $argv 6]
set inp_width [lindex $argv 7]
set wgt_width [lindex $argv 8]
set acc_width [lindex $argv 9]
set out_width [lindex $argv 10]
set batch [lindex $argv 11]
set block_in [lindex $argv 12]
set block_out [lindex $argv 13]
set uop_buff_size [lindex $argv 14]
set inp_buff_size [lindex $argv 15]
set wgt_buff_size [lindex $argv 16]
set acc_buff_size [lindex $argv 17]
set out_buff_size [lindex $argv 18]
} else {
set src_dir "../src/"
set sim_dir "../sim/"
set test_dir "../../src/test/"
set include_dir "../../include"
set target_period 10
set inp_width 3
set wgt_width 3
set acc_width 5
set out_width 3
set batch 1
set block_out 4
set block_in 4
set uop_buff_size 15
set inp_buff_size 15
set wgt_buff_size 15
set acc_buff_size 17
set out_buff_size 15
}
# C define flags to pass to compiler
set cflags "-I $include_dir -I $include_dir/hardware/hls \
-DDEBUG=0 -DLOG_WGT_WIDTH=$wgt_width -DLOG_INP_WIDTH=$inp_width \
-DLOG_ACC_WIDTH=$acc_width -DLOG_OUT_WIDTH=$out_width \
-DLOG_BATCH=$batch -DLOG_BLOCK_OUT=$block_out -DLOG_BLOCK_IN=$block_in \
-DLOG_UOP_BUFF_SIZE=$uop_buff_size -DLOG_INP_BUFF_SIZE=$inp_buff_size \
-DLOG_WGT_BUFF_SIZE=$wgt_buff_size -DLOG_ACC_BUFF_SIZE=$acc_buff_size \
-DLOG_OUT_BUFF_SIZE=$out_buff_size"
# Initializes the HLS design and sets HLS pragmas for memory partitioning.
# This is necessary because of a Vivado restriction that doesn't allow for
# buses wider than 1024 bits.
proc init_design {per inp_width wgt_width out_width batch block_in block_out} {
# Set device number
set_part {xc7z020clg484-1}
# Set the clock frequency
create_clock -period $per -name default
# Set input partition factor to (INP_VECTOR_WIDTH*BATCH/1024)
set inp_partition_factor [expr {(1 << ($inp_width + $block_in + $batch)) / 1024}]
if {$inp_partition_factor == 0} {
set_directive_array_reshape -type complete -dim 2 "load" inp_mem
set_directive_array_reshape -type complete -dim 2 "compute" inp_mem
} else {
# Set input reshaping factor below to (1024/INP_VECTOR_WIDTH)
set inp_reshape_factor [expr {1024 / (1 << ($inp_width + $block_in))}]
set_directive_array_partition -type block -factor $inp_partition_factor -dim 2 "load" inp_mem
set_directive_array_partition -type block -factor $inp_partition_factor -dim 2 "compute" inp_mem
set_directive_array_reshape -type block -factor $inp_reshape_factor -dim 2 "load" inp_mem
set_directive_array_reshape -type block -factor $inp_reshape_factor -dim 2 "compute" inp_mem
}
# Set weight partition factor to (WGT_VECTOR_WIDTH*BLOCK_OUT/1024)
set wgt_partition_factor [expr {(1 << ($wgt_width + $block_in + $block_out)) / 1024}]
if {$wgt_partition_factor == 0} {
set_directive_array_reshape -type complete -dim 2 "load" wgt_mem
set_directive_array_reshape -type complete -dim 2 "compute" wgt_mem
} else {
# Set weight reshaping factor below to (1024/WGT_VECTOR_WIDTH)
set wgt_reshape_factor [expr {1024 / (1 << ($wgt_width + $block_in))}]
set_directive_array_partition -type block -factor $wgt_partition_factor -dim 2 "load" wgt_mem
set_directive_array_partition -type block -factor $wgt_partition_factor -dim 2 "compute" wgt_mem
set_directive_array_reshape -type block -factor $wgt_reshape_factor -dim 2 "load" wgt_mem
set_directive_array_reshape -type block -factor $wgt_reshape_factor -dim 2 "compute" wgt_mem
}
# Set output partition factor to (OUT_VECTOR_WIDTH*BATCH/1024)
set out_partition_factor [expr {(1 << ($out_width + $block_out + $batch)) / 1024}]
if {$out_partition_factor == 0} {
set_directive_array_reshape -type complete -dim 2 "compute" out_mem
set_directive_array_reshape -type complete -dim 2 "store" out_mem
} else {
# Set output reshaping factor below to (1024/OUT_VECTOR_WIDTH)
set out_reshape_factor [expr {1024 / (1 << ($out_width + $block_out))}]
set_directive_array_partition -type block -factor $out_partition_factor -dim 2 "compute" out_mem
set_directive_array_partition -type block -factor $out_partition_factor -dim 2 "store" out_mem
set_directive_array_reshape -type block -factor $out_reshape_factor -dim 2 "compute" out_mem
set_directive_array_reshape -type block -factor $out_reshape_factor -dim 2 "store" out_mem
}
}
# HLS behavioral sim
open_project vta_sim
set_top vta
add_files $src_dir/vta.cc -cflags $cflags
add_files -tb $sim_dir/vta_test.cc -cflags $cflags
add_files -tb $test_dir/vta_test_lib.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
csim_design -clean
close_project
# Generate fetch stage
open_project vta_fetch
set_top fetch
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
csynth_design
export_design -format ip_catalog
close_project
# Generate load stage
open_project vta_load
set_top load
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
csynth_design
export_design -format ip_catalog
close_project
# Generate compute stage
open_project vta_compute
set_top compute
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
csynth_design
export_design -format ip_catalog
close_project
# Generate store stage
open_project vta_store
set_top store
add_files $src_dir/vta.cc -cflags $cflags
open_solution "solution0"
init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
csynth_design
export_design -format ip_catalog
close_project
exit
#
# Copyright (c) 2018 by Contributors
# file: hsi.tcl
# brief: Driver generation script for ARMv7 driver libraries.
#
open_hw_design export/vta.hdf
create_sw_design swdesign -proc ps7_cortexa9_0 -os standalone
generate_bsp -dir bsp
exit
This diff is collapsed.
/*!
* Copyright (c) 2018 by Contributors
* \file vta_test.cpp
* \brief Simulation tests for the VTA design.
*/
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include "vta.h"
#include "vta_test_lib.h"
int main(void)
{
#if DEBUG==1
printParameters();
#endif
// Buffer indexing
assert(LOG_ACC_BUFF_DEPTH>=LOG_INP_BUFF_DEPTH);
// Micro op bound
assert(UOP_GEM_3_1<UOP_WIDTH);
assert(UOP_ALU_3_1<UOP_WIDTH);
// Instruction alignment checks
assert(INSN_MEM_7_1<INSN_MEM_8_0);
assert(INSN_GEM_8_1<INSN_GEM_9_0);
// Instruction bounds
assert(INSN_MEM_E_1<INS_WIDTH);
assert(INSN_GEM_E_1<INS_WIDTH);
assert(INSN_ALU_F_1<INS_WIDTH);
int status = 0;
// Run ALU test (vector-scalar operators)
status |= alu_test(ALU_OPCODE_MIN, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_MIN, true, 16, 128, false);
status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, false);
status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, false);
status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, true);
status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, false);
// Run ALU test (vector-vector operators)
status |= alu_test(ALU_OPCODE_MIN, false, 16, 128, true);
status |= alu_test(ALU_OPCODE_MIN, false, 16, 128, false);
status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, true);
status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, false);
status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, true);
status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, false);
// Run blocked GEMM test
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 2);
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 2);
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 1);
status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 1);
return status;
}
\ No newline at end of file
/*!
* Copyright (c) 2018 by Contributors
* \file vta.h
* \brief Type definitions and prototype for VTA HLS design.
*/
#ifndef VTA_MAIN_H_
#define VTA_MAIN_H_
#include <assert.h>
#include <ap_axi_sdata.h>
#include <ap_int.h>
#include <hls_stream.h>
#include "vta_typedefs.h"
#include "vta_params.h"
/*!
* \brief Fetch module.
* Reads in \a insn_count instructions via DMA and pushes them to the
* appropriate load, gemm or store queue.
* \param insns Instruction data base address in DRAM. AXI-4 master port.
* \param insn_count Total instruction count. AXI-lite memory mapped register.
* \param load_queue Load instruction queue. AXI-stream FIFO.
* \param gemm_queue GEMM instruction queue. AXI-stream FIFO.
* \param store_queue Store instruction queue. AXI-stream FIFO.
*/
void fetch (
uint32_t insn_count,
volatile insn_T *insns,
hls::stream<insn_T> &load_queue,
hls::stream<insn_T> &gemm_queue,
hls::stream<insn_T> &store_queue);
/*!
* \brief Load module.
* Reads in load instructions from the load queue, and performs appropriate
* DMA load operation to the \a wgt_mem and \a inp_mem SRAM buffers from DRAM.
* Updates dependence queues accordingly.
* \param inputs Input data base address in DRAM. AXI-4 master port.
* \param weights Weight data base address in DRAM. AXI-4 master port.
* \param load_queue Load instruction queue. AXI-stream FIFO.
* \param g2l_dep_queue Dependence queue from GEMM to load stage.
* AXI-stream FIFO.
* \param l2g_dep_queue Dependence queue from load to GEMM stage.
* AXI-stream FIFO.
* \param inp_mem Local input SRAM buffer. Write only single port BRAM.
* \param wgt_mem Local weight SRAM buffer. Write only single port BRAM.
*/
void load (
volatile inp_vec_T *inputs,
volatile wgt_vec_T *weights,
hls::stream<insn_T> &load_queue,
hls::stream<bool> &g2l_dep_queue,
hls::stream<bool> &l2g_dep_queue,
inp_vec_T inp_mem[INP_BUFF_DEPTH][BATCH],
wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT]
);
/*!
* \brief Compute module.
* Reads in GEMM instructions from the gemm queue, and performs appropriate
* GEMM/ALU instructions. Reads in data from the \a wgt_mem and \a inp_mem,
* and writes computation results into the \a out_mem. Updates dependence
* queues accordingly.
* \param done Signal that indicates that VLA is done. AXI-lite memory mapped
* register.
* \param uops Micro-op data base address in DRAM. AXI-4 master port.
* \param biases Bias data base address in DRAM. AXI-4 master port.
* \param gemm_queue GEMM instruction queue. AXI-stream FIFO.
* \param l2g_dep_queue Dependence queue from load to gemm stage.
* AXI-stream FIFO.
* \param s2g_dep_queue Dependence queue from store to gemm stage.
* AXI-stream FIFO.
* \param g2l_dep_queue Dependence queue from gemm to load stage.
* AXI-stream FIFO.
* \param g2s_dep_queue Dependence queue from gemm to store stage.
* AXI-stream FIFO.
* \param inp_mem Local input SRAM buffer. Read only single port BRAM.
* \param wgt_mem Local weight SRAM buffer. Read only single port BRAM.
* \param out_mem Local output SRAM buffer. Write only single port BRAM.
*/
void compute (
volatile uint32_t &done,
volatile uop_T *uops,
volatile acc_vec_T *biases,
hls::stream<insn_T> &gemm_queue,
hls::stream<bool> &l2g_dep_queue,
hls::stream<bool> &s2g_dep_queue,
hls::stream<bool> &g2l_dep_queue,
hls::stream<bool> &g2s_dep_queue,
out_vec_T inp_mem[INP_BUFF_DEPTH][BATCH],
wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT],
out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH]
);
/*!
* \brief Store module.
* Reads in store instructions from the store queue, and performs appropriate
* store instructions from the output buffer in SRAM to DRAM. Updates dependence
* queues accordingly.
* \param outputs Output data base address in DRAM. AXI-4 master port.
* \param store_queue Store instruction queue. AXI-stream FIFO.
* \param g2s_dep_queue Dependence queue from gemm to store stage.
* AXI-stream FIFO.
* \param s2g_dep_queue Dependence queue from store to gemm stage.
* AXI-stream FIFO.
* \param out_mem Local output SRAM buffer. Read only single port BRAM.
*/
void store (
volatile out_vec_T *outputs,
hls::stream<insn_T> &store_queue,
hls::stream<bool> &g2s_dep_queue,
hls::stream<bool> &s2g_dep_queue,
out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH]
);
/*!
* \brief VTA wrapper for simulation purpose only.
* Orchestrates dataflow execution of the fetch, load, GEMM and store stages.
* \param insn_count Total instruction count. AXI-lite memory mapped register.
* \param insns Instruction data base address in DRAM. AXI-4 master port.
* \param uops Micro-op data base address in DRAM. AXI-4 master port.
* \param inputs Input data base address in DRAM. AXI-4 master port.
* \param weights Weight data base address in DRAM. AXI-4 master port.
* \param biases Bias data base address in DRAM. AXI-4 master port.
* \param outputs Output data base address in DRAM. AXI-4 master port.
*/
void vta (
uint32_t insn_count,
volatile insn_T *insns,
volatile uop_T *uops,
volatile inp_vec_T *inputs,
volatile wgt_vec_T *weights,
volatile acc_vec_T *biases,
volatile out_vec_T *outputs);
#endif // VTA_MAIN_H_
\ No newline at end of file
/*!
* Copyright (c) 2018 by Contributors
* \file vta_typedefs.h
* \brief Type definitions for VTA HLS design.
*/
#ifndef VTA_TYPEDEFS_H_
#define VTA_TYPEDEFS_H_
#include <assert.h>
#include <ap_axi_sdata.h>
#include <ap_int.h>
#include <hls_stream.h>
#include "vta_params.h"
/* \typedef uop_T Micro-op datatype*/
typedef ap_uint<UOP_WIDTH> uop_T;
/* \typedef inp_T Input datatype*/
typedef ap_int<INP_WIDTH> inp_T;
/* \typedef wgt_T Weight datatype*/
typedef ap_int<WGT_WIDTH> wgt_T;
/* \typedef out_T Output datatype*/
typedef ap_int<OUT_WIDTH> out_T;
/* \typedef acc_T Accumulator datatype*/
typedef ap_int<ACC_WIDTH> acc_T;
/* \typedef mul_T Multiplier output datatype*/
typedef ap_int<WGT_WIDTH+INP_WIDTH+1> mul_T;
/* \typedef sum_T GEMM accumulator datatype*/
typedef ap_int<WGT_WIDTH+INP_WIDTH+LOG_BLOCK_IN+1> sum_T;
/* \typedef inp_vec_T Input vector datatype*/
typedef ap_uint<INP_WIDTH*BLOCK_IN> inp_vec_T;
/* \typedef wgt_vec_T Weight vector datatype*/
typedef ap_uint<WGT_WIDTH*BLOCK_IN> wgt_vec_T;
/* \typedef acc_vec_T Accumulator vector datatype*/
typedef ap_uint<ACC_WIDTH*BLOCK_OUT> acc_vec_T;
/* \typedef out_vec_T Output vector datatype*/
typedef ap_uint<OUT_WIDTH*BLOCK_OUT> out_vec_T;
/* \typedef uop_idx_T Micro-op SRAM index datatype*/
typedef ap_uint<LOG_UOP_BUFF_DEPTH+1> uop_idx_T;
/* \typedef inp_idx_T Input SRAM index datatype*/
typedef ap_uint<LOG_INP_BUFF_DEPTH+1> inp_idx_T;
/* \typedef wgt_idx_T Weight SRAM index datatype*/
typedef ap_uint<LOG_WGT_BUFF_DEPTH+1> wgt_idx_T;
/* \typedef acc_idx_T Accumulator SRAM index datatype*/
typedef ap_uint<LOG_ACC_BUFF_DEPTH+1> acc_idx_T;
/* \typedef opcode_T Opcode datatype*/
typedef ap_uint<OPCODE_BIT_WIDTH> opcode_T;
/* \typedef insn_T Instruction datatype*/
typedef ap_uint<INS_WIDTH> insn_T;
/* \typedef loop_T Loop bound datatype*/
typedef ap_uint<LOOP_ITER_WIDTH> loop_T;
/* \typedef memop_id_T Memory operation ID datatype*/
typedef ap_uint<MEMOP_ID_BIT_WIDTH> memop_id_T;
/* \typedef memop_sram_T Memory operation SRAM index datatype*/
typedef ap_uint<MEMOP_SRAM_ADDR_BIT_WIDTH> memop_sram_T;
/* \typedef memop_dram_T Memory operation DRAM index datatype*/
typedef ap_uint<MEMOP_DRAM_ADDR_BIT_WIDTH> memop_dram_T;
/* \typedef memop_size_T Memory operation range datatype*/
typedef ap_uint<MEMOP_SIZE_BIT_WIDTH> memop_size_T;
/* \typedef memop_stride_T Memory operation stride datatype*/
typedef ap_uint<MEMOP_STRIDE_BIT_WIDTH> memop_stride_T;
/* \typedef memop_pad_T Memory operation pad width datatype*/
typedef ap_uint<MEMOP_PAD_BIT_WIDTH> memop_pad_T;
/* \typedef aluop_opcode_T ALU operation opcode datatype*/
typedef ap_uint<ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
/* \typedef aluop_opcode_T ALU operation immediate datatype*/
typedef ap_int<ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
/* \typedef aluop_opcode_T ALU operation shift immediate datatype*/
typedef ap_uint<LOG_ACC_WIDTH> aluop_sh_imm_T;
#endif // VTA_TYPEDEFS_H_
This diff is collapsed.
/*!
* Copyright (c) 2018 by Contributors
* \file vta_pynq_driver.h
* \brief VTA driver for Pynq board.
*/
#ifndef VTA_PYNQ_DRIVER_H_
#define VTA_PYNQ_DRIVER_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <assert.h>
#include <fcntl.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#ifdef __arm__
#include "libxlnk_cma.h"
#else
void* cma_alloc(size_t size, int cached);
void cma_free(void* buf);
uint32_t cma_get_phy_addr(void* buf);
void xlnkFlushCache(void* buf, int size);
void xlnkInvalidateCache(void* buf, int size);
#endif
/*! \brief VTA command handle */
typedef void * VTAHandle;
/*! \brief DMA command handle */
typedef struct {
/*! \brief Register map to the AXI DMA control registers*/
void *dma_register_map;
/*! \brief Transmit data descriptor*/
void *mm2s_descriptor_register_map;
/*! \brief Receive data descriptor*/
void *s2mm_descriptor_register_map;
/*! \brief Transmit data descriptor physical address*/
uint32_t mm2s_descriptor_phy;
/*! \brief Receive data descriptor physical address*/
uint32_t s2mm_descriptor_phy;
/*! \brief Descriptor size */
uint32_t descriptor_size;
/*! \brief Transaction count for tx channel */
uint32_t mm2s_count;
/*! \brief Transaction count for rx channel */
uint32_t s2mm_count;
/*! \brief Multi-channel mode enable */
int multichannel_en;
} DMAHandle;
/*! \brief partial bitstream status file path */
#define BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream"
/*! \brief bitstream destination file path */
#define BS_XDEVCFG "/dev/xdevcfg"
/*! \brief Path to /dev/mem */
#define DEV_MEM_PATH "/dev/mem"
/*! \brief MMIO driver constant */
#define MMIO_WORD_LENGTH 4
/*! \brief MMIO driver constant */
#define MMIO_WORD_MASK (~(MMIO_WORD_LENGTH - 1))
/*! \brief VTA configuration register address range */
#define VTA_RANGE 0x100
/*! \brief VTA configuration register start value */
#define VTA_START 0x1
/*! \brief VTA configuration register auto-restart value */
#define VTA_AUTORESTART 0x81
/*! \brief VTA configuration register done value */
#define VTA_DONE 0x1
/*! \brief VTA fetch stage configuration register address
* from auto-generated XPAR_FETCH_0_S_AXI_CONTROL_BUS_BASEADDR define
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
*/
#define VTA_FETCH_ADDR 0x43C00000
/*! \brief VTA compute stage configuration register address
* from auto-generated XPAR_COMPUTE_0_S_AXI_CONTROL_BUS_BASEADDR define
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
*/
#define VTA_COMPUTE_ADDR 0x43C10000
/*! \brief VTA compute stage configuration register address
* from auto-generated XPAR_LOAD_0_S_AXI_CONTROL_BUS_BASEADDR define
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
*/
#define VTA_LOAD_ADDR 0x43C20000
/*! \brief VTA store stage configuration register address
* from auto-generated XPAR_STORE_0_S_AXI_CONTROL_BUS_BASEADDR define
* in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
*/
#define VTA_STORE_ADDR 0x43C30000
/*! \brief Memory management constants with libxlnk_cma */
#define CACHED 1
/*! \brief Memory management constants with libxlnk_cma */
#define NOT_CACHED 0
/*! \brief log2 of SDS buffer size limit */
#define LOG_MAX_XFER 22
/*! \brief SDS buffer size limit */
#define MAX_XFER (1<<LOG_MAX_XFER)
/*!
* \brief Returns a memory map to FPGA configuration registers.
* \param addr The base physical address of the configuration registers.
* \param length The size of the memory mapped region in bytes.
* \return A pointer to the memory mapped region.
*/
void *MapRegister(unsigned addr, size_t length);
/*!
* \brief Deletes the configuration register memory map.
* \param vta The memory mapped region.
* \param length The size of the memory mapped region in bytes.
*/
void UnmapRegister(void *vta, size_t length);
/*!
* \brief Writes to a memory mapped configuration register.
* \param vta_base The handle to the memory mapped configuration registers.
* \param offset The offset of the register to write to.
* \param val The value to be written to the memory mapped register.
*/
void WriteMappedReg(VTAHandle vta_base, unsigned offset, unsigned val);
/*!
* \brief Reads from the memory mapped configuration register.
* \param vta_base The handle to the memory mapped configuration registers.
* \param offset The offset of the register to read from.
* \return The value read from the memory mapped register.
*/
unsigned ReadMappedReg(VTAHandle vta_base, unsigned offset);
/*!
* \brief Programming the bit stream on the FPGA.
* \param bitstream The path to the bit stream file.
*/
void ProgramVTA(const char* bitstream);
#ifdef __cplusplus
}
#endif
#endif // VTA_PYNQ_DRIVER_H_
\ No newline at end of file
/*!
* Copyright (c) 2018 by Contributors
* \file vta_test_lib.cpp
* \brief Test library for the VTA design simulation and driver tests.
*/
#ifndef VTA_TESTLIB_H_
#define VTA_TESTLIB_H_
#include "vta_params.h"
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#ifdef NO_SIM
#include "vta_pynq_driver.h"
typedef uint64_t axi_T;
typedef uint32_t uop_T;
typedef int8_t wgt_T;
typedef int8_t inp_T;
typedef int32_t acc_T;
uint64_t vta (
uint32_t insn_count,
VTAGenericInsn *insns,
VTAUop *uops,
inp_T *inputs,
wgt_T *weights,
acc_T *biases,
inp_T *outputs);
#else //NO_SIM
#include "vta.h"
#include "vta_typedefs.h"
#endif //NO_SIM
/*!
* \brief Returns opcode string.
* \param opcode Opcode parameter (defined in vta_defines.h).
* \param use_imm Boolean that indicates if the operation uses an immediate value.
* \return The opcode string.
*/
const char* getOpcodeString(int opcode, bool use_imm);
/*!
* \brief Performs buffer data packing and tiling.
* \param dst Pointer to the packed, and tiled destination 1D array (flattened).
* \param src Pointer to the unpacked source 2D array.
* \param y_size Number of rows.
* \param x_size Number of columns.
* \param y_block Inner tiling along row dimension.
* \param x_block Inner tiling along column dimension.
*/
template <typename T, int T_WIDTH>
void packBuffer(T *dst, T **src, int y_size, int x_size, int y_block, int x_block);
/*!
* \brief Performs buffer data unpacking.
* \param dst Pointer to the unpacked destination 2D array.
* \param src Pointer to the packed, and tiled source 1D array (flattened).
* \param y_size Number of rows.
* \param x_size Number of columns.
* \param y_block Inner tiling along row dimension.
* \param x_block Inner tiling along column dimension.
*/
template <typename T, int T_WIDTH>
void unpackBuffer(T **dst, T *src, int y_size, int x_size, int y_block, int x_block);
/*!
* \brief Allocates and initializes a 2D array in the heap.
* \param rows Number of rows.
* \param cols Number of columns.
* \return Pointer to the 2D array.
*/
template <typename T, int T_WIDTH>
T ** allocInit2dArray(int rows, int cols);
/*!
* \brief Allocates a 2D array in the heap.
* \param rows Number of rows.
* \param cols Number of columns.
* \return Pointer to the 2D array.
*/
template <typename T>
T ** alloc2dArray(int rows, int cols);
/*!
* \brief Frees a 2D array.
* \param array Pointer to the 2D array to be freed.
* \param rows Number of rows.
* \param cols Number of columns.
*/
template <typename T>
void free2dArray(T **array, int rows, int cols);
/*!
* \brief Allocates a 3D array in the heap.
* \param rows Number of rows (dim 0).
* \param cols Number of columns (dim 1).
* \param depth Depth of the array (dim 2).
* \return Pointer to the 3D array.
*/
template <typename T>
T *** alloc3dArray(int rows, int cols, int depth);
/*!
* \brief Frees a 3D array.
* \param array Pointer to the 3D array.
* \param rows Number of rows (dim 0).
* \param cols Number of columns (dim 1).
* \param depth Depth of the array (dim 2).
*/
template <typename T>
void free3dArray(T *** array, int rows, int cols, int depth);
/*!
* \brief Performs memory allocation in a physically contiguous region of memory.
* \param num_bytes Size of the buffer in bytes.
* \return Pointer to the allocated buffer.
*/
void * allocBuffer(size_t num_bytes);
/*!
* \brief Frees buffer allocated in a physically contiguous region of memory.
* \param buffer Pointer to the buffer to free.
*/
void freeBuffer(void * buffer);
/*!
* \brief Returns a VTA reset instruction on a 2D patch of the register file.
* \param type On-chip memory target.
* \param sram_offset Offset in SRAM.
* \param y_size Number of rows to reset (y axis).
* \param x_size Number of elements per row to reset (x axis).
* \param x_stride Stride along the x axis.
* \param pop_prev_dep Pop dependence from previous stage.
* \param pop_next_dep Pop dependence from next stage.
* \param push_prev_dep Push dependence to previous stage.
* \param push_next_dep Push dependence to next stage.
* \return A VTAGenericInsn for a reset op.
*/
VTAGenericInsn reset2DInsn(int type, int sram_offset, int y_size, int x_size, int x_stride,
int pop_prev_dep, int pop_next_dep, int push_prev_dep, int push_next_dep);
/*!
* \brief Returns a VTA 2D load or store instruction.
* \param opcode Type of operation.
* \param type On-chip memory target.
* \param sram_offset Offset in SRAM.
* \param dram_offset Offset in DRAM.
* \param y_size Number of rows to load/store (y axis).
* \param x_size Number of elements per row to load/store (x axis).
* \param x_stride Stride along the x axis.
* \param y_pad Padding along the y axis.
* \param x_pad Padding along the x axis.
* \param pop_prev_dep Pop dependence from previous stage.
* \param pop_next_dep Pop dependence from next stage.
* \param push_prev_dep Push dependence to previous stage.
* \param push_next_dep Push dependence to next stage.
* \return A VTAGenericInsn for a 2D load or store op.
*/
VTAGenericInsn get2DLoadStoreInsn(int opcode, int type, int sram_offset, int dram_offset,
int y_size, int x_size, int x_stride, int y_pad, int x_pad, int pop_prev_dep, int pop_next_dep,
int push_prev_dep, int push_next_dep);
/*!
* \brief Returns a VTA 1D load or store instruction.
* \param opcode Type of operation.
* \param type On-chip memory target.
* \param sram_offset Offset in SRAM.
* \param dram_offset Offset in DRAM.
* \param size Number of elements to load/store.
* \param pop_prev_dep Pop dependence from previous stage.
* \param pop_next_dep Pop dependence from next stage.
* \param push_prev_dep Push dependence to previous stage.
* \param push_next_dep Push dependence to next stage.
* \return A VTAGenericInsn for a 1D load or store op.
*/
VTAGenericInsn get1DLoadStoreInsn(int opcode, int type, int sram_offset, int dram_offset, int size,
int pop_prev_dep, int pop_next_dep, int push_prev_dep, int push_next_dep);
/*!
* \brief Returns a VTA matrix multiplication instruction of size (a, b) x (b, c).
* \param uop_offset Offset of the micro-op in SRAM.
* \param batch Batch size (a).
* \param in_feat Input features (b).
* \param out_feat Output features (c).
* \param uop_compression Apply micro-op compression.
* \param pop_prev_dep Pop dependence from previous stage.
* \param pop_next_dep Pop dependence from next stage.
* \param push_prev_dep Push dependence to previous stage.
* \param push_next_dep Push dependence to next stage.
* \return A VTAGenericInsn for a GEMM op.
*/
VTAGenericInsn getGEMMInsn(int uop_offset, int batch, int in_feat, int out_feat,
bool uop_compression, int pop_prev_dep, int pop_next_dep, int push_prev_dep,
int push_next_dep);
/*!
* \brief Returns a VTA ALU instruction for map type operation.
* \param opcode Opcode of the ALU instruction.
* \param use_imm Use immediate.
* \param imm Immediate value (int16).
* \param vector_size Vector size of the ALU operation size.
* \param uop_compression Apply micro-op compression.
* \param pop_prev_dep Pop dependence from previous stage.
* \param pop_next_dep Pop dependence from next stage.
* \param push_prev_dep Push dependence to previous stage.
* \param push_next_dep Push dependence to next stage.
* \return A VTAGenericInsn for a ALU op.
*/
VTAGenericInsn getALUInsn(int opcode, bool use_imm, int imm, int vector_size, bool uop_compression,
int pop_prev_dep, int pop_next_dep, int push_prev_dep, int push_next_dep);
/*!
* \brief Returns a VTA finish instruction.
* \param pop_prev Pop dependence from previous stage.
* \param pop_next Pop dependence from next stage.
* \return A VTAGenericInsn for a finish op.
*/
VTAGenericInsn getFinishInsn(bool pop_prev, bool pop_next);
/*!
* \brief Returns an allocated buffer of VTA micro-ops to implement a copy operation.
* \param y_size Number of rows to load/store (y axis).
* \param x_size Number of elements per row to load/store (x axis).
* \param uop_compression Apply micro-op compression.
* \return A VTAUop pointer to an allocated micro-op buffer.
*/
VTAUop * getCopyUops(int y_size, int x_size, int uop_compression);
/*!
* \brief Returns an allocated buffer of VTA micro-ops to implement a matrix multiplication
* of size (a, b) x (b, c).
* \param batch Batch size (a).
* \param in_feat Input features (b).
* \param out_feat Output features (c).
* \param uop_compression Apply micro-op compression.
* \param multi_threaded Generate micro-ops for two virtual execution threads.
* \return A VTAUop pointer to an allocated micro-op buffer.
*/
VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression,
bool multi_threaded);
/*!
* \brief Returns an allocated buffer of VTA micro-ops to implement a vector-vector map operation.
* \param vector_size Vector size.
* \param uop_compression Apply micro-op compression.
* \return A VTAUop pointer to an allocated micro-op buffer.
*/
VTAUop * getMapALUUops(int vector_size, bool uop_compression);
/*!
* \brief Print out parameters of the VTA design (for debugging purposes).
*/
void printParameters();
/*!
* \brief Print out instruction information (for debugging purposes).
* \param num_insn Number of instructions.
* \param insns Pointer to the instruction buffer.
*/
void printInstruction(int num_insn, VTAGenericInsn *insns);
/*!
* \brief Print out micro-op information (for debugging purposes).
* \param num_insn Number of micro-ops.
* \param insns Pointer to the micro-op buffer.
*/
void printMicroOp(int num_uop, VTAUop *uops);
/*!
* \brief VTA ALU unit test.
* \param opcode The ALU opcode.
* \param use_imm Use immediate.
* \param batch Batch size.
* \param vector_size Vector length of the ALU operation.
* \param uop_compression Apply micro-op compression.
* \return Number of errors from the test run.
*/
int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_compression);
/*!
* \brief VTA blocked GEMM unit test.
* \param batch Batch size.
* \param channels Channel width.
* \param block Blocking size.
* \param uop_compression Apply micro-op compression.
* \return Number of errors from the test run.
*/
int blocked_gemm_test(int batch, int channels, int block, bool uop_compression,
int virtual_threads);
#endif // VTA_TESTLIB_H_
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment