hardware compilation flow, and driver tests

47001850 · Thierry Moreau · Tianqi Chen · b8d8e5b6 · 47001850 · 47001850
Commit 47001850 authored 7 years ago by Thierry Moreau Committed by Tianqi Chen 6 years ago
--- a/vta/README.md
+++ b/vta/README.md
-# vta
 Open Hardware/Software Stack for Vertical Deep Learning System Optimization
+==============================================
+
+[![GitHub license](http://dmlc.github.io/img/apache2.svg)](./LICENSE)
+
+VTA is an open hardware/software co-design stack for deep learning systems.
+It provides a customizable hardware accelerator template for deep learning inference workloads,
+combined with a fully functional compiler stack built with TVM.
+
+License
+-------
+© Contributors, 2018. Licensed under an [Apache-2.0](https://github.com/tmoreau89/vta/blob/master/LICENSE) license.
--- a/vta/docs/.gitignore
+++ b/vta/docs/.gitignore
+doxygen
--- a/vta/docs/Doxyfile
+++ b/vta/docs/Doxyfile
--- a/vta/hardware/vivado/.gitignore
+++ b/vta/hardware/vivado/.gitignore
+build
--- a/vta/hardware/vivado/Makefile
+++ b/vta/hardware/vivado/Makefile
+# Directories
+#  Every path is derived from $(CURDIR), i.e. the directory make runs in.
+ROOTDIR = $(CURDIR)
+#  Root of all generated output (HLS and Vivado build trees live below it)
+BUILD_DIR = $(ROOTDIR)/build
+#  Tcl scripts driving vivado_hls, vivado and hsi
+SCRIPT_DIR = $(ROOTDIR)/scripts
+#  HLS design sources (passed to hls.tcl as the design source path)
+SRC_DIR = $(ROOTDIR)/../../src/hardware/hls
+#  Simulation testbench sources
+SIM_DIR = $(ROOTDIR)/sim
+#  Test library sources
+TEST_DIR = $(ROOTDIR)/../../src/test
+#  Header search root handed to the HLS compiler
+INCLUDE_DIR = $(ROOTDIR)/../../include
+
+# Executables
+#  Looked up on PATH; override on the command line to use another install.
+VIVADO_HLS = vivado_hls
+VIVADO = vivado
+HSI = hsi
+
+# Build parameters:
+#  All LOG_* values are base-2 logarithms; override them on the command line,
+#  e.g. `make LOG_BATCH=1`.
+#  Number of threads during compilation
+NUM_THREADS = 8
+#  Target Frequency
+CLOCK_FREQ = 100
+#  Log of input width in bits
+LOG_INP_WIDTH = 3
+#  Log of weight width in bits
+LOG_WGT_WIDTH = 3
+#  Log of accum width in bits
+LOG_ACC_WIDTH = 5
+#  Log of output width in bits
+LOG_OUT_WIDTH = $(LOG_INP_WIDTH)
+#  Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
+LOG_BATCH = 0
+#  Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
+LOG_IN_BLOCK = 4
+#  Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
+LOG_OUT_BLOCK = 4
+#  Log of uop buffer size in Bytes
+LOG_UOP_BUFF_SIZE = 15
+#  Log of inp buffer size in Bytes
+LOG_INP_BUFF_SIZE = 15
+#  Log of wgt buffer size in Bytes
+LOG_WGT_BUFF_SIZE = 15
+#  Log of acc buffer size in Bytes
+LOG_ACC_BUFF_SIZE = 17
+#  Log of out buffer size in Bytes
+#  Derived: the acc buffer size scaled by the out/acc element width ratio.
+#  Simply expanded (:=) so the shell runs once at parse time instead of on
+#  every reference.
+LOG_OUT_BUFF_SIZE := $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" )
+
+# Derived parameters
+#  Each value is 2^LOG_*; computed once at parse time (:=) so the shell is not
+#  re-run on every reference.
+#  Input width in bits
+INP_WIDTH := $(shell echo "$$(( 1 << $(LOG_INP_WIDTH) ))" )
+#  Weight width in bits
+WGT_WIDTH := $(shell echo "$$(( 1 << $(LOG_WGT_WIDTH) ))" )
+#  Output width in bits
+OUT_WIDTH := $(shell echo "$$(( 1 << $(LOG_OUT_WIDTH) ))" )
+#  Tensor batch size
+BATCH := $(shell echo "$$(( 1 << $(LOG_BATCH) ))" )
+#  Tensor inner block size
+IN_BLOCK := $(shell echo "$$(( 1 << $(LOG_IN_BLOCK) ))" )
+#  Tensor outer block size
+OUT_BLOCK := $(shell echo "$$(( 1 << $(LOG_OUT_BLOCK) ))" )
+#  Uop buffer size in Bytes
+UOP_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_UOP_BUFF_SIZE) ))" )
+#  Inp buffer size in Bytes
+INP_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_INP_BUFF_SIZE) ))" )
+#  Wgt buffer size in Bytes
+WGT_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_WGT_BUFF_SIZE) ))" )
+#  Acc buffer size in Bytes
+ACC_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_ACC_BUFF_SIZE) ))" )
+#  Out buffer size in Bytes
+OUT_BUFF_SIZE := $(shell echo "$$(( 1 << $(LOG_OUT_BUFF_SIZE) ))" )
+
+# Derive clock target period
+#  ceil(1000 / CLOCK_FREQ) using integer arithmetic
+TARGET_PER := $(shell echo "$$(( (1000 + $(CLOCK_FREQ) - 1) / $(CLOCK_FREQ) ))" )
+
+# Derive config name
+#  Layout: <BATCH>x<IN_BLOCK>x<OUT_BLOCK>_<INP_WIDTH>bx<WGT_WIDTH>b_<CLOCK_FREQ>MHz_<TARGET_PER>ns
+#  Simply expanded (:=) so the shell-derived pieces are evaluated once here
+#  rather than each time a recipe references a build path.
+CONF := $(BATCH)x$(IN_BLOCK)x$(OUT_BLOCK)_$(INP_WIDTH)bx$(WGT_WIDTH)b_$(CLOCK_FREQ)MHz_$(TARGET_PER)ns
+#  Per-configuration output directories for the HLS and Vivado steps
+IP_BUILD_PATH := $(BUILD_DIR)/hls/$(CONF)
+HW_BUILD_PATH := $(BUILD_DIR)/vivado/$(CONF)
+
+.PHONY: all ip bit driver clean
+
+all: driver
+
+# Run scripts/hls.tcl inside the per-configuration HLS build directory.
+# The values after -tclargs are positional and must follow the "Arg N" list
+# at the top of hls.tcl: in particular Arg 11 is the in block size and
+# Arg 12 the out block size, so LOG_IN_BLOCK is passed before LOG_OUT_BLOCK.
+ip:
+	mkdir -p $(IP_BUILD_PATH)
+	cd $(IP_BUILD_PATH) && \
+		$(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \
+			-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) $(TARGET_PER) \
+			$(LOG_INP_WIDTH) $(LOG_WGT_WIDTH) $(LOG_ACC_WIDTH) $(LOG_OUT_WIDTH) \
+			$(LOG_BATCH) $(LOG_IN_BLOCK) $(LOG_OUT_BLOCK) \
+			$(LOG_UOP_BUFF_SIZE) $(LOG_INP_BUFF_SIZE) $(LOG_WGT_BUFF_SIZE) \
+			$(LOG_ACC_BUFF_SIZE) $(LOG_OUT_BUFF_SIZE)
+
+# Run scripts/vivado.tcl in the per-configuration Vivado build directory,
+# after the HLS step (ip) has completed. Arguments are positional.
+bit: ip
+	mkdir -p $(HW_BUILD_PATH) && cd $(HW_BUILD_PATH) && \
+		$(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl -tclargs \
+			$(IP_BUILD_PATH) $(NUM_THREADS) $(CLOCK_FREQ) \
+			$(INP_WIDTH) $(WGT_WIDTH) $(OUT_WIDTH) \
+			$(BATCH) $(IN_BLOCK) $(OUT_BLOCK) \
+			$(INP_BUFF_SIZE) $(WGT_BUFF_SIZE) $(OUT_BUFF_SIZE)
+
+# Generate the board support package with hsi.tcl, then build it.
+# The sub-build uses $(MAKE) rather than a literal `make` so that -j, -n and
+# other flags of the parent invocation are passed down.
+driver: bit
+	cd $(HW_BUILD_PATH) && $(HSI) -mode tcl -source $(SCRIPT_DIR)/hsi.tcl -nojournal -nolog
+	$(MAKE) -C $(HW_BUILD_PATH)/bsp
+
+# Remove everything this Makefile generates (the whole build tree).
+clean:
+	rm -rf $(BUILD_DIR)
\ No newline at end of file
--- a/vta/hardware/vivado/scripts/hls.tcl
+++ b/vta/hardware/vivado/scripts/hls.tcl
+#
+#  Copyright (c) 2018 by Contributors
+#  file: hls.tcl
+#  brief: HLS generation script.
+#
+
+# Command line arguments:
+# Arg 1: path to design sources
+# Arg 2: path to sim sources
+# Arg 3: path to test sources
+# Arg 4: path to include sources
+# Arg 5: target clock period
+# Arg 6: input type width (log)
+# Arg 7: weight type width (log)
+# Arg 8: accum type width (log)
+# Arg 9: output type width (log)
+# Arg 10: batch size (log)
+# Arg 11: in block size (log)
+# Arg 12: out block size (log)
+# Arg 13: uop buffer size in B (log)
+# Arg 14: inp buffer size in B (log)
+# Arg 15: wgt buffer size in B (log)
+# Arg 16: acc buffer size in B (log)
+# Arg 17: out buffer size in B (log)
+
+# The 17 documented arguments start at index 2 of $argv, hence the expected
+# length of 19. Any other argument count falls back to the built-in defaults
+# below, which mirror the defaults in the Makefile.
+if { [llength $argv] == 19 } {
+	set src_dir [lindex $argv 2]
+	set sim_dir [lindex $argv 3]
+	set test_dir [lindex $argv 4]
+	set include_dir [lindex $argv 5]
+	set target_period [lindex $argv 6]
+	set inp_width [lindex $argv 7]
+	set wgt_width [lindex $argv 8]
+	set acc_width [lindex $argv 9]
+	set out_width [lindex $argv 10]
+	set batch [lindex $argv 11]
+	set block_in [lindex $argv 12]
+	set block_out [lindex $argv 13]
+	set uop_buff_size [lindex $argv 14]
+	set inp_buff_size [lindex $argv 15]
+	set wgt_buff_size [lindex $argv 16]
+	set acc_buff_size [lindex $argv 17]
+	set out_buff_size [lindex $argv 18]
+} else {
+	set src_dir "../src/"
+	set sim_dir "../sim/"
+	set test_dir "../../src/test/"
+	set include_dir "../../include"
+	set target_period 10
+	set inp_width 3
+	set wgt_width 3
+	set acc_width 5
+	set out_width 3
+	# batch is a log2 value (it becomes -DLOG_BATCH); 0 matches the Makefile
+	# default LOG_BATCH = 0, i.e. a batch size of 1.
+	set batch 0
+	set block_out 4
+	set block_in 4
+	set uop_buff_size 15
+	set inp_buff_size 15
+	set wgt_buff_size 15
+	set acc_buff_size 17
+	set out_buff_size 15
+}
+
+# C define flags to pass to compiler
+# Adds the include search paths and forwards every log2 design parameter
+# parsed above as a -DLOG_* preprocessor define; DEBUG is fixed to 0.
+set cflags "-I $include_dir -I $include_dir/hardware/hls \
+	-DDEBUG=0 -DLOG_WGT_WIDTH=$wgt_width -DLOG_INP_WIDTH=$inp_width \
+	-DLOG_ACC_WIDTH=$acc_width -DLOG_OUT_WIDTH=$out_width \
+	-DLOG_BATCH=$batch -DLOG_BLOCK_OUT=$block_out -DLOG_BLOCK_IN=$block_in \
+	-DLOG_UOP_BUFF_SIZE=$uop_buff_size -DLOG_INP_BUFF_SIZE=$inp_buff_size \
+	-DLOG_WGT_BUFF_SIZE=$wgt_buff_size -DLOG_ACC_BUFF_SIZE=$acc_buff_size \
+	-DLOG_OUT_BUFF_SIZE=$out_buff_size"
+
+# Applies the dimension-2 layout directives for one on-chip buffer.
+#   mem            name of the buffer array
+#   stages         list of HLS functions the directives are attached to
+#   log_vec_width  log2 of the width in bits of one buffer element
+#   log_rows       log2 of the number of elements along dimension 2
+# The full row is 2^(log_vec_width + log_rows) bits wide. Below 1024 bits the
+# dimension is reshaped completely; otherwise it is block-partitioned by
+# (row width / 1024) and block-reshaped by (1024 / element width).
+proc set_mem_layout {mem stages log_vec_width log_rows} {
+	set partition_factor [expr {(1 << ($log_vec_width + $log_rows)) / 1024}]
+	if {$partition_factor == 0} {
+		foreach stage $stages {
+			set_directive_array_reshape -type complete -dim 2 $stage $mem
+		}
+		return
+	}
+	set reshape_factor [expr {1024 / (1 << $log_vec_width)}]
+	foreach stage $stages {
+		set_directive_array_partition -type block -factor $partition_factor -dim 2 $stage $mem
+	}
+	foreach stage $stages {
+		set_directive_array_reshape -type block -factor $reshape_factor -dim 2 $stage $mem
+	}
+}
+
+# Initializes the HLS design: selects the part, creates the clock and sets
+# the memory layout pragmas for the input, weight and output buffers.
+# The layout pragmas work around a Vivado restriction that does not allow
+# buses wider than 1024 bits. All width/size arguments are log2 values.
+proc init_design {per inp_width wgt_width out_width batch block_in block_out} {
+
+	# Target device
+	set_part {xc7z020clg484-1}
+
+	# Clock constraint with the requested period
+	create_clock -period $per -name default
+
+	# Input buffer: rows of BATCH input vectors, used by load and compute
+	set_mem_layout inp_mem {load compute} [expr {$inp_width + $block_in}] $batch
+	# Weight buffer: rows of BLOCK_OUT weight vectors, used by load and compute
+	set_mem_layout wgt_mem {load compute} [expr {$wgt_width + $block_in}] $block_out
+	# Output buffer: rows of BATCH output vectors, used by compute and store
+	set_mem_layout out_mem {compute store} [expr {$out_width + $block_out}] $batch
+}
+
+# Behavioral C simulation of the complete design (top function: vta),
+# with the simulation testbench and the test library attached.
+open_project vta_sim
+set_top vta
+add_files $src_dir/vta.cc -cflags $cflags
+add_files -tb $sim_dir/vta_test.cc -cflags $cflags
+add_files -tb $test_dir/vta_test_lib.cc -cflags $cflags
+open_solution "solution0"
+init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
+csim_design -clean
+close_project
+
+# Synthesize each pipeline stage in its own project (vta_<stage>) and export
+# it to the IP catalog. Stages are processed in the order listed.
+foreach stage {fetch load compute store} {
+	open_project vta_$stage
+	set_top $stage
+	add_files $src_dir/vta.cc -cflags $cflags
+	open_solution "solution0"
+	init_design $target_period $inp_width $wgt_width $out_width $batch $block_in $block_out
+	csynth_design
+	export_design -format ip_catalog
+	close_project
+}
+
+exit
+
--- a/vta/hardware/vivado/scripts/hsi.tcl
+++ b/vta/hardware/vivado/scripts/hsi.tcl
+#
+#  Copyright (c) 2018 by Contributors
+#  file: hsi.tcl
+#  brief: Driver generation script for ARMv7 driver libraries.
+#
+
+# Open the hardware description file; the path is relative to the directory
+# the tool is started from (the Makefile runs it inside the Vivado build dir).
+open_hw_design export/vta.hdf
+# Create a software design named swdesign for processor ps7_cortexa9_0
+# using the standalone OS.
+create_sw_design swdesign -proc ps7_cortexa9_0 -os standalone
+# Write the board support package sources into ./bsp
+generate_bsp -dir bsp
+
+exit
--- a/vta/hardware/vivado/scripts/vivado.tcl
+++ b/vta/hardware/vivado/scripts/vivado.tcl
--- a/vta/hardware/vivado/sim/vta_test.cc
+++ b/vta/hardware/vivado/sim/vta_test.cc
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file vta_test.cc
+ * \brief Simulation tests for the VTA design.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+
+#include "vta.h"
+#include "vta_test_lib.h"
+
+/*!
+ * \brief Simulation entry point.
+ *   Checks the compile-time parameter layout, then runs the ALU and blocked
+ *   GEMM tests and ORs their return codes together.
+ * \return The combined status of all tests.
+ */
+int main(void)
+{
+
+#if DEBUG==1
+    printParameters();
+#endif
+
+    // Sanity checks on the configured parameters
+    //   buffer indexing
+    assert(LOG_ACC_BUFF_DEPTH>=LOG_INP_BUFF_DEPTH);
+    //   micro-op fields must fit in a micro-op
+    assert(UOP_GEM_3_1<UOP_WIDTH);
+    assert(UOP_ALU_3_1<UOP_WIDTH);
+    //   instruction field alignment
+    assert(INSN_MEM_7_1<INSN_MEM_8_0);
+    assert(INSN_GEM_8_1<INSN_GEM_9_0);
+    //   instruction fields must fit in an instruction
+    assert(INSN_MEM_E_1<INS_WIDTH);
+    assert(INSN_GEM_E_1<INS_WIDTH);
+    assert(INSN_ALU_F_1<INS_WIDTH);
+
+    int status = 0;
+
+    // ALU tests, vector-scalar operators: every opcode is run with the last
+    // argument set to true and then to false.
+    const int scalar_ops[] = {ALU_OPCODE_MIN, ALU_OPCODE_MAX, ALU_OPCODE_ADD, ALU_OPCODE_SHR};
+    const int num_scalar_ops = sizeof(scalar_ops) / sizeof(scalar_ops[0]);
+    for (int i = 0; i < num_scalar_ops; i++) {
+        status |= alu_test(scalar_ops[i], true, 16, 128, true);
+        status |= alu_test(scalar_ops[i], true, 16, 128, false);
+    }
+
+    // ALU tests, vector-vector operators (no SHR variant here)
+    const int vector_ops[] = {ALU_OPCODE_MIN, ALU_OPCODE_MAX, ALU_OPCODE_ADD};
+    const int num_vector_ops = sizeof(vector_ops) / sizeof(vector_ops[0]);
+    for (int i = 0; i < num_vector_ops; i++) {
+        status |= alu_test(vector_ops[i], false, 16, 128, true);
+        status |= alu_test(vector_ops[i], false, 16, 128, false);
+    }
+
+    // Blocked GEMM tests: last argument 2 first, then 1; for each, the
+    // boolean argument is true and then false.
+    for (int last_arg = 2; last_arg >= 1; last_arg--) {
+        status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, last_arg);
+        status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, last_arg);
+    }
+
+    return status;
+
+}
\ No newline at end of file
--- a/vta/include/hardware/hls/vta.h
+++ b/vta/include/hardware/hls/vta.h
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file vta.h
+ * \brief Type definitions and prototype for VTA HLS design.
+ */
+#ifndef VTA_MAIN_H_
+#define VTA_MAIN_H_
+
+#include <assert.h>
+#include <ap_axi_sdata.h>
+#include <ap_int.h>
+#include <hls_stream.h>
+
+#include "vta_typedefs.h"
+#include "vta_params.h"
+
+/*!
+* \brief Fetch module.
+*   Reads in \a insn_count instructions via DMA and pushes them to the
+*   appropriate load, gemm or store queue.
+* \param insns Instruction data base address in DRAM. AXI-4 master port.
+* \param insn_count Total instruction count. AXI-lite memory mapped register.
+* \param load_queue Load instruction queue. AXI-stream FIFO.
+* \param gemm_queue GEMM instruction queue. AXI-stream FIFO.
+* \param store_queue Store instruction queue. AXI-stream FIFO.
+*/
+void fetch (
+  uint32_t insn_count,
+  volatile insn_T *insns,
+  hls::stream<insn_T> &load_queue,
+  hls::stream<insn_T> &gemm_queue,
+  hls::stream<insn_T> &store_queue);
+
+/*!
+* \brief Load module.
+*   Reads in load instructions from the load queue, and performs appropriate
+*   DMA load operation to the \a wgt_mem and \a inp_mem SRAM buffers from DRAM.
+*   Updates dependence queues accordingly.
+* \param inputs Input data base address in DRAM. AXI-4 master port.
+* \param weights Weight data base address in DRAM. AXI-4 master port.
+* \param load_queue Load instruction queue. AXI-stream FIFO.
+* \param g2l_dep_queue Dependence queue from GEMM to load stage.
+*   AXI-stream FIFO.
+* \param l2g_dep_queue Dependence queue from load to GEMM stage.
+*   AXI-stream FIFO.
+* \param inp_mem Local input SRAM buffer. Write only single port BRAM.
+* \param wgt_mem Local weight SRAM buffer. Write only single port BRAM.
+*/
+void load (
+  volatile inp_vec_T *inputs,
+  volatile wgt_vec_T *weights,
+  hls::stream<insn_T> &load_queue,
+  hls::stream<bool> &g2l_dep_queue,
+  hls::stream<bool> &l2g_dep_queue,
+  inp_vec_T inp_mem[INP_BUFF_DEPTH][BATCH],
+  wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT]
+  );
+
+/*!
+* \brief Compute module.
+*   Reads in GEMM instructions from the gemm queue, and performs appropriate
+*   GEMM/ALU instructions. Reads in data from the \a wgt_mem and \a inp_mem,
+*   and writes computation results into the \a out_mem. Updates dependence
+*   queues accordingly.
+* \param done Signal that indicates that VTA is done.  AXI-lite memory mapped
+*   register.
+* \param uops Micro-op data base address in DRAM. AXI-4 master port.
+* \param biases Bias data base address in DRAM. AXI-4 master port.
+* \param gemm_queue GEMM instruction queue. AXI-stream FIFO.
+* \param l2g_dep_queue Dependence queue from load to gemm stage.
+*   AXI-stream FIFO.
+* \param s2g_dep_queue Dependence queue from store to gemm stage.
+*   AXI-stream FIFO.
+* \param g2l_dep_queue Dependence queue from gemm to load stage.
+*   AXI-stream FIFO.
+* \param g2s_dep_queue Dependence queue from gemm to store stage.
+*   AXI-stream FIFO.
+* \param inp_mem Local input SRAM buffer. Read only single port BRAM.
+*   Declared with the same element type (inp_vec_T) as the \a inp_mem
+*   parameter of load().
+* \param wgt_mem Local weight SRAM buffer. Read only single port BRAM.
+* \param out_mem Local output SRAM buffer. Write only single port BRAM.
+*/
+void compute (
+  volatile uint32_t &done,
+  volatile uop_T *uops,
+  volatile acc_vec_T *biases,
+  hls::stream<insn_T> &gemm_queue,
+  hls::stream<bool> &l2g_dep_queue,
+  hls::stream<bool> &s2g_dep_queue,
+  hls::stream<bool> &g2l_dep_queue,
+  hls::stream<bool> &g2s_dep_queue,
+  // NOTE(review): previously declared as out_vec_T, which only aliases
+  // inp_vec_T when INP_WIDTH*BLOCK_IN == OUT_WIDTH*BLOCK_OUT. Confirm the
+  // definition of compute() uses inp_vec_T as well.
+  inp_vec_T inp_mem[INP_BUFF_DEPTH][BATCH],
+  wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT],
+  out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH]
+  );
+
+/*!
+* \brief Store module.
+*   Reads in store instructions from the store queue, and performs appropriate
+*   store instructions from the output buffer in SRAM to DRAM. Updates dependence
+*   queues accordingly.
+* \param outputs Output data base address in DRAM. AXI-4 master port.
+* \param store_queue Store instruction queue. AXI-stream FIFO.
+* \param g2s_dep_queue Dependence queue from gemm to store stage.
+*   AXI-stream FIFO.
+* \param s2g_dep_queue Dependence queue from store to gemm stage.
+*   AXI-stream FIFO.
+* \param out_mem Local output SRAM buffer. Read only single port BRAM.
+*/
+void store (
+  volatile out_vec_T *outputs,
+  hls::stream<insn_T> &store_queue,
+  hls::stream<bool> &g2s_dep_queue,
+  hls::stream<bool> &s2g_dep_queue,
+  out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH]
+  );
+
+/*!
+* \brief VTA wrapper for simulation purpose only.
+*   Orchestrates dataflow execution of the fetch, load, GEMM and store stages.
+* \param insn_count Total instruction count. AXI-lite memory mapped register.
+* \param insns Instruction data base address in DRAM. AXI-4 master port.
+* \param uops Micro-op data base address in DRAM. AXI-4 master port.
+* \param inputs Input data base address in DRAM. AXI-4 master port.
+* \param weights Weight data base address in DRAM. AXI-4 master port.
+* \param biases Bias data base address in DRAM. AXI-4 master port.
+* \param outputs Output data base address in DRAM. AXI-4 master port.
+*/
+void vta (
+  uint32_t insn_count,
+  volatile insn_T *insns,
+  volatile uop_T *uops,
+  volatile inp_vec_T *inputs,
+  volatile wgt_vec_T *weights,
+  volatile acc_vec_T *biases,
+  volatile out_vec_T *outputs);
+
+#endif  // VTA_MAIN_H_
\ No newline at end of file
--- a/vta/include/hardware/hls/vta_typedefs.h
+++ b/vta/include/hardware/hls/vta_typedefs.h
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file vta_typedefs.h
+ * \brief Type definitions for VTA HLS design.
+ */
+#ifndef VTA_TYPEDEFS_H_
+#define VTA_TYPEDEFS_H_
+
+#include <assert.h>
+#include <ap_axi_sdata.h>
+#include <ap_int.h>
+#include <hls_stream.h>
+
+#include "vta_params.h"
+
+/* \typedef uop_T Micro-op datatype*/
+typedef ap_uint<UOP_WIDTH> uop_T;
+
+/* \typedef inp_T Input datatype*/
+typedef ap_int<INP_WIDTH> inp_T;
+
+/* \typedef wgt_T Weight datatype*/
+typedef ap_int<WGT_WIDTH> wgt_T;
+
+/* \typedef out_T Output datatype*/
+typedef ap_int<OUT_WIDTH> out_T;
+
+/* \typedef acc_T Accumulator datatype*/
+typedef ap_int<ACC_WIDTH> acc_T;
+
+/* \typedef mul_T Multiplier output datatype*/
+typedef ap_int<WGT_WIDTH+INP_WIDTH+1> mul_T;
+
+/* \typedef sum_T GEMM accumulator datatype*/
+typedef ap_int<WGT_WIDTH+INP_WIDTH+LOG_BLOCK_IN+1> sum_T;
+
+/* \typedef inp_vec_T Input vector datatype*/
+typedef ap_uint<INP_WIDTH*BLOCK_IN> inp_vec_T;
+
+/* \typedef wgt_vec_T Weight vector datatype*/
+typedef ap_uint<WGT_WIDTH*BLOCK_IN> wgt_vec_T;
+
+/* \typedef acc_vec_T Accumulator vector datatype*/
+typedef ap_uint<ACC_WIDTH*BLOCK_OUT> acc_vec_T;
+
+/* \typedef out_vec_T Output vector datatype*/
+typedef ap_uint<OUT_WIDTH*BLOCK_OUT> out_vec_T;
+
+/* \typedef uop_idx_T Micro-op SRAM index datatype*/
+typedef ap_uint<LOG_UOP_BUFF_DEPTH+1> uop_idx_T;
+
+/* \typedef inp_idx_T Input SRAM index datatype*/
+typedef ap_uint<LOG_INP_BUFF_DEPTH+1> inp_idx_T;
+
+/* \typedef wgt_idx_T Weight SRAM index datatype*/
+typedef ap_uint<LOG_WGT_BUFF_DEPTH+1> wgt_idx_T;
+
+/* \typedef acc_idx_T Accumulator SRAM index datatype*/
+typedef ap_uint<LOG_ACC_BUFF_DEPTH+1> acc_idx_T;
+
+/* \typedef opcode_T Opcode datatype*/
+typedef ap_uint<OPCODE_BIT_WIDTH> opcode_T;
+
+/* \typedef insn_T Instruction datatype*/
+typedef ap_uint<INS_WIDTH> insn_T;
+
+/* \typedef loop_T Loop bound datatype*/
+typedef ap_uint<LOOP_ITER_WIDTH> loop_T;
+
+/* \typedef memop_id_T Memory operation ID datatype*/
+typedef ap_uint<MEMOP_ID_BIT_WIDTH> memop_id_T;
+
+/* \typedef memop_sram_T Memory operation SRAM index datatype*/
+typedef ap_uint<MEMOP_SRAM_ADDR_BIT_WIDTH> memop_sram_T;
+
+/* \typedef memop_dram_T Memory operation DRAM index datatype*/
+typedef ap_uint<MEMOP_DRAM_ADDR_BIT_WIDTH> memop_dram_T;
+
+/* \typedef memop_size_T Memory operation range datatype*/
+typedef ap_uint<MEMOP_SIZE_BIT_WIDTH> memop_size_T;
+
+/* \typedef memop_stride_T Memory operation stride datatype*/
+typedef ap_uint<MEMOP_STRIDE_BIT_WIDTH> memop_stride_T;
+
+/* \typedef memop_pad_T Memory operation pad width datatype*/
+typedef ap_uint<MEMOP_PAD_BIT_WIDTH> memop_pad_T;
+
+/* \typedef aluop_opcode_T ALU operation opcode datatype*/
+typedef ap_uint<ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
+
+/* \typedef aluop_imm_T ALU operation immediate datatype*/
+typedef ap_int<ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
+
+/* \typedef aluop_sh_imm_T ALU operation shift immediate datatype*/
+typedef ap_uint<LOG_ACC_WIDTH> aluop_sh_imm_T;
+
+#endif // VTA_TYPEDEFS_H_
--- a/vta/include/vta_params.h
+++ b/vta/include/vta_params.h
--- a/vta/include/vta_pynq_driver.h
+++ b/vta/include/vta_pynq_driver.h
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file vta_pynq_driver.h
+ * \brief VTA driver for Pynq board.
+ */
+
+#ifndef VTA_PYNQ_DRIVER_H_
+#define VTA_PYNQ_DRIVER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include <assert.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+
+#ifdef __arm__
+#include "libxlnk_cma.h"
+#else
+void* cma_alloc(size_t size, int cached);
+void cma_free(void* buf);
+uint32_t cma_get_phy_addr(void* buf);
+void xlnkFlushCache(void* buf, int size);
+void xlnkInvalidateCache(void* buf, int size);
+#endif
+
+/*! \brief VTA command handle */
+typedef void * VTAHandle;
+
+/*! \brief DMA command handle */
+typedef struct {
+  /*! \brief Register map to the AXI DMA control registers*/
+  void *dma_register_map;
+  /*! \brief Transmit data descriptor*/
+  void *mm2s_descriptor_register_map;
+  /*! \brief Receive data descriptor*/
+  void *s2mm_descriptor_register_map;
+  /*! \brief Transmit data descriptor physical address*/
+  uint32_t mm2s_descriptor_phy;
+  /*! \brief Receive data descriptor physical address*/
+  uint32_t s2mm_descriptor_phy;
+  /*! \brief Descriptor size */
+  uint32_t descriptor_size;
+  /*! \brief Transaction count for tx channel */
+  uint32_t mm2s_count;
+  /*! \brief Transaction count for rx channel */
+  uint32_t s2mm_count;
+  /*! \brief Multi-channel mode enable */
+  int multichannel_en;
+} DMAHandle;
+
+/*! \brief partial bitstream status file path */
+#define BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream"
+/*! \brief bitstream destination file path */
+#define BS_XDEVCFG "/dev/xdevcfg"
+
+/*! \brief Path to /dev/mem */
+#define DEV_MEM_PATH "/dev/mem"
+/*! \brief MMIO driver constant */
+#define MMIO_WORD_LENGTH 4
+/*! \brief MMIO driver constant */
+#define MMIO_WORD_MASK (~(MMIO_WORD_LENGTH - 1))
+
+/*! \brief VTA configuration register address range */
+#define VTA_RANGE 0x100
+/*! \brief VTA configuration register start value */
+#define VTA_START 0x1
+/*! \brief VTA configuration register auto-restart value */
+#define VTA_AUTORESTART 0x81
+/*! \brief VTA configuration register done value */
+#define VTA_DONE 0x1
+
+/*! \brief VTA fetch stage configuration register address
+*   from auto-generated XPAR_FETCH_0_S_AXI_CONTROL_BUS_BASEADDR define
+*   in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
+*/
+#define VTA_FETCH_ADDR    0x43C00000
+/*! \brief VTA compute stage configuration register address
+*   from auto-generated XPAR_COMPUTE_0_S_AXI_CONTROL_BUS_BASEADDR define
+*   in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
+*/
+#define VTA_COMPUTE_ADDR  0x43C10000
+/*! \brief VTA load stage configuration register address
+*   from auto-generated XPAR_LOAD_0_S_AXI_CONTROL_BUS_BASEADDR define
+*   in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
+*/
+#define VTA_LOAD_ADDR     0x43C20000
+/*! \brief VTA store stage configuration register address
+*   from auto-generated XPAR_STORE_0_S_AXI_CONTROL_BUS_BASEADDR define
+*   in xparameters.h (under build/vivado/<design name>/export/bsp/ps7_cortexa9_0/include)
+*/
+#define VTA_STORE_ADDR    0x43C30000
+
+/*! \brief Memory management constants with libxlnk_cma */
+#define CACHED 1
+/*! \brief Memory management constants with libxlnk_cma */
+#define NOT_CACHED 0
+
+/*! \brief log2 of SDS buffer size limit */
+#define LOG_MAX_XFER 22
+/*! \brief SDS buffer size limit */
+#define MAX_XFER (1<<LOG_MAX_XFER)
+
+/*!
+ * \brief Returns a memory map to FPGA configuration registers.
+ * \param addr The base physical address of the configuration registers.
+ * \param length The size of the memory mapped region in bytes.
+ * \return A pointer to the memory mapped region.
+ */
+void *MapRegister(unsigned addr, size_t length);
+
+/*!
+ * \brief Deletes the configuration register memory map.
+ * \param vta The memory mapped region.
+ * \param length The size of the memory mapped region in bytes.
+ */
+void UnmapRegister(void *vta, size_t length);
+
+/*!
+ * \brief Writes to a memory mapped configuration register.
+ * \param vta_base The handle to the memory mapped configuration registers.
+ * \param offset The offset of the register to write to.
+ * \param val The value to be written to the memory mapped register.
+ */
+void WriteMappedReg(VTAHandle vta_base, unsigned offset, unsigned val);
+
+/*!
+ * \brief Reads from the memory mapped configuration register.
+ * \param vta_base The handle to the memory mapped configuration registers.
+ * \param offset The offset of the register to read from.
+ * \return The value read from the memory mapped register.
+ */
+unsigned ReadMappedReg(VTAHandle vta_base, unsigned offset);
+
+/*!
+ * \brief Programming the bit stream on the FPGA.
+ * \param bitstream The path to the bit stream file.
+ */
+void ProgramVTA(const char* bitstream);
+
+#ifdef __cplusplus
+}
+#endif
+#endif  // VTA_PYNQ_DRIVER_H_
\ No newline at end of file
--- a/vta/include/vta_test_lib.h
+++ b/vta/include/vta_test_lib.h
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file vta_test_lib.h
+ * \brief Test library for the VTA design simulation and driver tests.
+ */
+
+#ifndef VTA_TESTLIB_H_
+#define VTA_TESTLIB_H_
+
+#include "vta_params.h"
+
+#include <assert.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifdef NO_SIM
+
+#include "vta_pynq_driver.h"
+
+typedef uint64_t axi_T;
+typedef uint32_t uop_T;
+typedef int8_t wgt_T;
+typedef int8_t inp_T;
+typedef int32_t acc_T;
+
+uint64_t vta (
+  uint32_t insn_count,
+  VTAGenericInsn *insns,
+  VTAUop *uops,
+  inp_T *inputs,
+  wgt_T *weights,
+  acc_T *biases,
+  inp_T *outputs);
+
+#else //NO_SIM
+
+#include "vta.h"
+#include "vta_typedefs.h"
+
+#endif //NO_SIM
+
+/*!
+* \brief Returns opcode string.
+* \param opcode Opcode parameter (defined in vta_defines.h).
+* \param use_imm Boolean that indicates if the operation uses an immediate value.
+* \return The opcode string.
+*/
+const char* getOpcodeString(int opcode, bool use_imm);
+
+/*!
+* \brief Performs buffer data packing and tiling.
+* \param dst Pointer to the packed, and tiled destination 1D array (flattened).
+* \param src Pointer to the unpacked source 2D array.
+* \param y_size Number of rows.
+* \param x_size Number of columns.
+* \param y_block Inner tiling along row dimension.
+* \param x_block Inner tiling along column dimension.
+*/
+template <typename T, int T_WIDTH>
+void packBuffer(T *dst, T **src, int y_size, int x_size, int y_block, int x_block);
+
+/*!
+* \brief Performs buffer data unpacking.
+* \param dst Pointer to the unpacked destination 2D array.
+* \param src Pointer to the packed, and tiled source 1D array (flattened).
+* \param y_size Number of rows.
+* \param x_size Number of columns.
+* \param y_block Inner tiling along row dimension.
+* \param x_block Inner tiling along column dimension.
+*/
+template <typename T, int T_WIDTH>
+void unpackBuffer(T **dst, T *src, int y_size, int x_size, int y_block, int x_block);
+
+/*!
+* \brief Allocates and initializes a 2D array in the heap.
+* \param rows Number of rows.
+* \param cols Number of columns.
+* \return Pointer to the 2D array.
+*/
+template <typename T, int T_WIDTH>
+T ** allocInit2dArray(int rows, int cols);
+
+/*!
+* \brief Allocates a 2D array in the heap.
+* \param rows Number of rows.
+* \param cols Number of columns.
+* \return Pointer to the 2D array.
+*/
+template <typename T>
+T ** alloc2dArray(int rows, int cols);
+
+/*!
+* \brief Frees a 2D array.
+* \param array Pointer to the 2D array to be freed.
+* \param rows Number of rows.
+* \param cols Number of columns.
+*/
+template <typename T>
+void free2dArray(T **array, int rows, int cols);
+
+/*!
+* \brief Allocates a 3D array in the heap.
+* \param rows Number of rows (dim 0).
+* \param cols Number of columns (dim 1).
+* \param depth Depth of the array (dim 2).
+* \return Pointer to the 3D array.
+*/
+template <typename T>
+T *** alloc3dArray(int rows, int cols, int depth);
+
+/*!
+* \brief Frees a 3D array.
+* \param array Pointer to the 3D array.
+* \param rows Number of rows (dim 0).
+* \param cols Number of columns (dim 1).
+* \param depth Depth of the array (dim 2).
+*/
+template <typename T>
+void free3dArray(T *** array, int rows, int cols, int depth);
+
+/*!
+* \brief Performs memory allocation in a physically contiguous region of memory.
+* \param num_bytes Size of the buffer in bytes.
+* \return Pointer to the allocated buffer.
+*/
+void * allocBuffer(size_t num_bytes);
+
+/*!
+* \brief Frees buffer allocated in a physically contiguous region of memory.
+* \param buffer Pointer to the buffer to free.
+*/
+void freeBuffer(void * buffer);
+
+/*!
+* \brief Returns a VTA reset instruction on a 2D patch of the register file.
+* \param type On-chip memory target.
+* \param sram_offset Offset in SRAM.
+* \param y_size Number of rows to reset (y axis).
+* \param x_size Number of elements per row to reset (x axis).
+* \param x_stride Stride along the x axis.
+* \param pop_prev_dep Pop dependence from previous stage.
+* \param pop_next_dep Pop dependence from next stage.
+* \param push_prev_dep Push dependence to previous stage.
+* \param push_next_dep Push dependence to next stage.
+* \return A VTAGenericInsn for a reset op.
+*/
+VTAGenericInsn reset2DInsn(int type, int sram_offset, int y_size, int x_size, int x_stride,
+  int pop_prev_dep, int pop_next_dep, int push_prev_dep, int push_next_dep);
+
+/*!
+* \brief Returns a VTA 2D load or store instruction.
+* \param opcode Type of operation.
+* \param type On-chip memory target.
+* \param sram_offset Offset in SRAM.
+* \param dram_offset Offset in DRAM.
+* \param y_size Number of rows to load/store (y axis).
+* \param x_size Number of elements per row to load/store (x axis).
+* \param x_stride Stride along the x axis.
+* \param y_pad Padding along the y axis.
+* \param x_pad Padding along the x axis.
+* \param pop_prev_dep Pop dependence from previous stage.
+* \param pop_next_dep Pop dependence from next stage.
+* \param push_prev_dep Push dependence to previous stage.
+* \param push_next_dep Push dependence to next stage.
+* \return A VTAGenericInsn for a 2D load or store op.
+*/
+VTAGenericInsn get2DLoadStoreInsn(int opcode, int type, int sram_offset, int dram_offset,
+  int y_size, int x_size, int x_stride, int y_pad, int x_pad, int pop_prev_dep, int pop_next_dep,
+  int push_prev_dep, int push_next_dep);
+
+/*!
+* \brief Returns a VTA 1D load or store instruction.
+* \param opcode Type of operation.
+* \param type On-chip memory target.
+* \param sram_offset Offset in SRAM.
+* \param dram_offset Offset in DRAM.
+* \param size Number of elements to load/store.
+* \param pop_prev_dep Pop dependence from previous stage.
+* \param pop_next_dep Pop dependence from next stage.
+* \param push_prev_dep Push dependence to previous stage.
+* \param push_next_dep Push dependence to next stage.
+* \return A VTAGenericInsn for a 1D load or store op.
+*/
+VTAGenericInsn get1DLoadStoreInsn(int opcode, int type, int sram_offset, int dram_offset, int size,
+  int pop_prev_dep, int pop_next_dep, int push_prev_dep, int push_next_dep);
+
+/*!
+* \brief Returns a VTA matrix multiplication instruction of size (a, b) x (b, c).
+* \param uop_offset Offset of the micro-op in SRAM.
+* \param batch Batch size (a).
+* \param in_feat Input features (b).
+* \param out_feat Output features (c).
+* \param uop_compression Apply micro-op compression.
+* \param pop_prev_dep Pop dependence from previous stage.
+* \param pop_next_dep Pop dependence from next stage.
+* \param push_prev_dep Push dependence to previous stage.
+* \param push_next_dep Push dependence to next stage.
+* \return A VTAGenericInsn for a GEMM op.
+*/
+VTAGenericInsn getGEMMInsn(int uop_offset, int batch, int in_feat, int out_feat,
+  bool uop_compression, int pop_prev_dep, int pop_next_dep, int push_prev_dep,
+  int push_next_dep);
+
+/*!
+* \brief Returns a VTA ALU instruction for a map type operation.
+* \param opcode Opcode of the ALU instruction.
+* \param use_imm Use immediate.
+* \param imm Immediate value (int16).
+* \param vector_size Vector size of the ALU operation.
+* \param uop_compression Apply micro-op compression.
+* \param pop_prev_dep Pop dependence from previous stage.
+* \param pop_next_dep Pop dependence from next stage.
+* \param push_prev_dep Push dependence to previous stage.
+* \param push_next_dep Push dependence to next stage.
+* \return A VTAGenericInsn for an ALU op.
+*/
+VTAGenericInsn getALUInsn(int opcode, bool use_imm, int imm, int vector_size, bool uop_compression,
+  int pop_prev_dep, int pop_next_dep, int push_prev_dep, int push_next_dep);
+
+/*!
+* \brief Returns a VTA finish instruction.
+* \param pop_prev Pop dependence from previous stage.
+* \param pop_next Pop dependence from next stage.
+* \return A VTAGenericInsn for a finish op.
+*/
+VTAGenericInsn getFinishInsn(bool pop_prev, bool pop_next);
+
+/*!
+* \brief Returns an allocated buffer of VTA micro-ops to implement a copy operation.
+* \param y_size Number of rows to load/store (y axis).
+* \param x_size Number of elements per row to load/store (x axis).
+* \param uop_compression Apply micro-op compression.
+*   NOTE(review): declared as int here, whereas getGEMMUops and getMapALUUops
+*   declare this flag as bool; confirm whether the difference is intentional.
+* \return A VTAUop pointer to an allocated micro-op buffer.
+*/
+VTAUop * getCopyUops(int y_size, int x_size, int uop_compression);
+
+/*!
+* \brief Returns an allocated buffer of VTA micro-ops to implement a matrix multiplication
+*   of size (a, b) x (b, c).
+* \param batch Batch size (a).
+* \param in_feat Input features (b).
+* \param out_feat Output features (c).
+* \param uop_compression Apply micro-op compression.
+* \param multi_threaded Generate micro-ops for two virtual execution threads.
+* \return A VTAUop pointer to an allocated micro-op buffer.
+*/
+VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression,
+  bool multi_threaded);
+
+/*!
+* \brief Returns an allocated buffer of VTA micro-ops to implement a vector-vector map operation.
+* \param vector_size Vector size.
+* \param uop_compression Apply micro-op compression.
+* \return A VTAUop pointer to an allocated micro-op buffer.
+*/
+VTAUop * getMapALUUops(int vector_size, bool uop_compression);
+
+/*!
+* \brief Print out parameters of the VTA design (for debugging purposes).
+*/
+void printParameters();
+
+/*!
+* \brief Print out instruction information (for debugging purposes).
+* \param num_insn Number of instructions.
+* \param insns Pointer to the instruction buffer.
+*/
+void printInstruction(int num_insn, VTAGenericInsn *insns);
+
+/*!
+* \brief Print out micro-op information (for debugging purposes).
+* \param num_uop Number of micro-ops.
+* \param uops Pointer to the micro-op buffer.
+*/
+void printMicroOp(int num_uop, VTAUop *uops);
+
+/*!
+* \brief VTA ALU unit test.
+* \param opcode The ALU opcode.
+* \param use_imm Use immediate.
+* \param batch Batch size.
+* \param vector_size Vector length of the ALU operation.
+* \param uop_compression Apply micro-op compression.
+* \return Number of errors from the test run.
+*/
+int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_compression);
+
+/*!
+* \brief VTA blocked GEMM unit test.
+* \param batch Batch size.
+* \param channels Channel width.
+* \param block Blocking size.
+* \param uop_compression Apply micro-op compression.
+* \param virtual_threads Number of virtual execution threads.
+*   NOTE(review): accepted values are not visible here; confirm against the definition.
+* \return Number of errors from the test run.
+*/
+int blocked_gemm_test(int batch, int channels, int block, bool uop_compression,
+  int virtual_threads);
+
+#endif  // VTA_TESTLIB_H_
\ No newline at end of file
--- a/vta/src/driver/pynq/vta_pynq_driver.c
+++ b/vta/src/driver/pynq/vta_pynq_driver.c
--- a/vta/src/hardware/hls/vta.cc
+++ b/vta/src/hardware/hls/vta.cc
--- a/vta/src/test/vta_test_lib.cc
+++ b/vta/src/test/vta_test_lib.cc
--- a/vta/tests/driver/Makefile
+++ b/vta/tests/driver/Makefile
--- a/vta/tests/driver/driver_test.cc
+++ b/vta/tests/driver/driver_test.cc