diff --git a/vta/Makefile b/vta/Makefile
index 6007ed2eeaefd4e351c1f558cde560898f249c42..74c23d691c66c90353aeadf9a232ab2489b6e5c8 100644
--- a/vta/Makefile
+++ b/vta/Makefile
@@ -76,7 +76,7 @@ lib/libvta.$(SHARED_LIBRARY_SUFFIX): $(VTA_LIB_OBJ)
 lint: pylint cpplint
 
 cpplint:
-	python nnvm/dmlc-core/scripts/lint.py vta cpp include src
+	python nnvm/dmlc-core/scripts/lint.py vta cpp include src hardware tests
 
 pylint:
 	pylint python/vta --rcfile=$(ROOTDIR)/tests/lint/pylintrc
diff --git a/vta/hardware/vivado/Makefile b/vta/hardware/vivado/Makefile
index dfcb06316e4de77cfc26db88206e901046bcac08..f3d779ee2a730789fd499272bfd53b815f482b12 100644
--- a/vta/hardware/vivado/Makefile
+++ b/vta/hardware/vivado/Makefile
@@ -1,6 +1,6 @@
 # Directories
 ROOTDIR = $(CURDIR)
-BUILD_DIR = $(ROOTDIR)/build
+BUILD_DIR = $(ROOTDIR)/../../build/hardware/vivado
 SCRIPT_DIR = $(ROOTDIR)/scripts
 SRC_DIR = $(ROOTDIR)/src
 SIM_DIR = $(ROOTDIR)/sim
@@ -27,20 +27,21 @@ include $(config)
 #--------------------
 
 #  Number of threads during compilation
-NUM_THREADS = 8
+VTA_HW_COMP_THREADS = 8
 
 #  Target Frequency
-CLOCK_FREQ = 100
+VTA_HW_COMP_CLOCK_FREQ = 100
 
 #  Timing closure compensation (0 for none, 3 for highest)
-TIMING_CLOSURE_COMP = 0
+VTA_HW_COMP_TIMING_COMP = 0
 
 # Derive clock target period
-TARGET_PER = $(shell echo "$$(( (1000 + $(CLOCK_FREQ) - 1) / $(CLOCK_FREQ) - 0))" )
+TARGET_PER := \
+$(shell echo "$$(( (1000 + $(VTA_HW_COMP_CLOCK_FREQ) - 1) / $(VTA_HW_COMP_CLOCK_FREQ) - $(VTA_HW_COMP_TIMING_COMP)))" )
 
 # Derive config name
 CONF = \
-	$(BATCH)x$(IN_BLOCK)x$(OUT_BLOCK)_$(INP_WIDTH)bx$(WGT_WIDTH)b_$(CLOCK_FREQ)MHz_$(TARGET_PER)ns
+$(VTA_BATCH)x$(VTA_IN_BLOCK)x$(VTA_OUT_BLOCK)_$(VTA_INP_WIDTH)bx$(VTA_WGT_WIDTH)b_$(VTA_HW_COMP_CLOCK_FREQ)MHz_$(TARGET_PER)ns
 IP_BUILD_PATH = $(BUILD_DIR)/hls/$(CONF)
 HW_BUILD_PATH = $(BUILD_DIR)/vivado/$(CONF)
 
@@ -53,23 +54,23 @@ ip:
 	cd $(IP_BUILD_PATH) && \
 		$(VIVADO_HLS) -f $(SCRIPT_DIR)/hls.tcl \
 			-tclargs $(SRC_DIR) $(SIM_DIR) $(TEST_DIR) $(INCLUDE_DIR) $(TARGET_PER) \
-			$(LOG_INP_WIDTH) $(LOG_WGT_WIDTH) $(LOG_ACC_WIDTH) $(LOG_OUT_WIDTH) \
-			$(LOG_BATCH) $(LOG_BLOCK_OUT) $(LOG_BLOCK_IN) \
-			$(LOG_UOP_BUFF_SIZE) $(LOG_INP_BUFF_SIZE) $(LOG_WGT_BUFF_SIZE) \
-			$(LOG_ACC_BUFF_SIZE) $(LOG_OUT_BUFF_SIZE)
+			$(VTA_LOG_INP_WIDTH) $(VTA_LOG_WGT_WIDTH) $(VTA_LOG_ACC_WIDTH) $(VTA_LOG_OUT_WIDTH) \
+			$(VTA_LOG_BATCH) $(VTA_LOG_BLOCK_OUT) $(VTA_LOG_BLOCK_IN) \
+			$(VTA_LOG_UOP_BUFF_SIZE) $(VTA_LOG_INP_BUFF_SIZE) $(VTA_LOG_WGT_BUFF_SIZE) \
+			$(VTA_LOG_ACC_BUFF_SIZE) $(VTA_LOG_OUT_BUFF_SIZE)
 
 bit: ip
 	mkdir -p $(HW_BUILD_PATH)
 	cd $(HW_BUILD_PATH) && \
 		$(VIVADO) -mode tcl -source $(SCRIPT_DIR)/vivado.tcl \
-		-tclargs $(IP_BUILD_PATH) $(NUM_THREADS) $(CLOCK_FREQ) \
-		$(INP_WIDTH) $(WGT_WIDTH) $(OUT_WIDTH) \
-		$(BATCH) $(IN_BLOCK) $(OUT_BLOCK) \
-		$(INP_BUFF_SIZE) $(WGT_BUFF_SIZE) $(OUT_BUFF_SIZE)
+		-tclargs $(IP_BUILD_PATH) $(VTA_HW_COMP_THREADS) $(VTA_HW_COMP_CLOCK_FREQ) \
+		$(VTA_INP_WIDTH) $(VTA_WGT_WIDTH) $(VTA_OUT_WIDTH) \
+		$(VTA_BATCH) $(VTA_IN_BLOCK) $(VTA_OUT_BLOCK) \
+		$(VTA_INP_BUFF_SIZE) $(VTA_WGT_BUFF_SIZE) $(VTA_OUT_BUFF_SIZE)
 
 driver: bit
 	cd $(HW_BUILD_PATH) && $(HSI) -mode tcl -source $(SCRIPT_DIR)/hsi.tcl -nojournal -nolog
 	cd $(HW_BUILD_PATH)/bsp && make
 
 clean:
-	rm -rf build
\ No newline at end of file
+	rm -rf $(BUILD_DIR)
\ No newline at end of file
diff --git a/vta/hardware/vivado/scripts/hls.tcl b/vta/hardware/vivado/scripts/hls.tcl
index 220c8f3ba3bfd8e3aa4d28c6a97f4024c4c515be..67ce742bf47ac575195e475ddd021d2b041bbe4f 100644
--- a/vta/hardware/vivado/scripts/hls.tcl
+++ b/vta/hardware/vivado/scripts/hls.tcl
@@ -63,12 +63,12 @@ if { [llength $argv] eq 19 } {
 
 # C define flags to pass to compiler
 set cflags "-I $include_dir -I $src_dir -I $test_dir \
-	-DDEBUG=0 -DLOG_WGT_WIDTH=$wgt_width -DLOG_INP_WIDTH=$inp_width \
-	-DLOG_ACC_WIDTH=$acc_width -DLOG_OUT_WIDTH=$out_width \
-	-DLOG_BATCH=$batch -DLOG_BLOCK_OUT=$block_out -DLOG_BLOCK_IN=$block_in \
-	-DLOG_UOP_BUFF_SIZE=$uop_buff_size -DLOG_INP_BUFF_SIZE=$inp_buff_size \
-	-DLOG_WGT_BUFF_SIZE=$wgt_buff_size -DLOG_ACC_BUFF_SIZE=$acc_buff_size \
-	-DLOG_OUT_BUFF_SIZE=$out_buff_size"
+	-DVTA_DEBUG=0 -DVTA_LOG_WGT_WIDTH=$wgt_width -DVTA_LOG_INP_WIDTH=$inp_width \
+	-DVTA_LOG_ACC_WIDTH=$acc_width -DVTA_LOG_OUT_WIDTH=$out_width \
+	-DVTA_LOG_BATCH=$batch -DVTA_LOG_BLOCK_OUT=$block_out -DVTA_LOG_BLOCK_IN=$block_in \
+	-DVTA_LOG_UOP_BUFF_SIZE=$uop_buff_size -DVTA_LOG_INP_BUFF_SIZE=$inp_buff_size \
+	-DVTA_LOG_WGT_BUFF_SIZE=$wgt_buff_size -DVTA_LOG_ACC_BUFF_SIZE=$acc_buff_size \
+	-DVTA_LOG_OUT_BUFF_SIZE=$out_buff_size"
 
 # Initializes the HLS design and sets HLS pragmas for memory partitioning.
 # This is necessary because of a Vivado restriction that doesn't allow for
diff --git a/vta/hardware/vivado/sim/vta_test.cc b/vta/hardware/vivado/sim/vta_test.cc
index 2031186f31ce62d0e9a6062fa338a5eda6f3f636..16f37a866464365f7e38ef905da101d3332d0e04 100644
--- a/vta/hardware/vivado/sim/vta_test.cc
+++ b/vta/hardware/vivado/sim/vta_test.cc
@@ -11,52 +11,49 @@
 #include "../src/vta.h"
 #include "../../../tests/hardware/common/test_lib.h"
 
-int main(void)
-{
-
-#if DEBUG==1
+int main(void) {
+#if VTA_DEBUG == 1
     printParameters();
 #endif
 
     // Buffer indexing
-    assert(LOG_ACC_BUFF_DEPTH>=LOG_INP_BUFF_DEPTH);
+    assert(VTA_LOG_ACC_BUFF_DEPTH >= VTA_LOG_INP_BUFF_DEPTH);
     // Micro op bound
-    assert(UOP_GEM_3_1<UOP_WIDTH);
-    assert(UOP_ALU_3_1<UOP_WIDTH);
+    assert(VTA_UOP_GEM_3_1 < VTA_UOP_WIDTH);
+    assert(VTA_UOP_ALU_3_1 < VTA_UOP_WIDTH);
     // Instruction alignment checks
-    assert(INSN_MEM_7_1<INSN_MEM_8_0);
-    assert(INSN_GEM_8_1<INSN_GEM_9_0);
+    assert(VTA_INSN_MEM_7_1 < VTA_INSN_MEM_8_0);
+    assert(VTA_INSN_GEM_8_1 < VTA_INSN_GEM_9_0);
     // Instruction bounds
-    assert(INSN_MEM_E_1<INS_WIDTH);
-    assert(INSN_GEM_E_1<INS_WIDTH);
-    assert(INSN_ALU_F_1<INS_WIDTH);
+    assert(VTA_INSN_MEM_E_1 < VTA_INS_WIDTH);
+    assert(VTA_INSN_GEM_E_1 < VTA_INS_WIDTH);
+    assert(VTA_INSN_ALU_F_1 < VTA_INS_WIDTH);
 
     int status = 0;
 
     // Run ALU test (vector-scalar operators)
-    status |= alu_test(ALU_OPCODE_MIN, true, 16, 128, true);
-    status |= alu_test(ALU_OPCODE_MIN, true, 16, 128, false);
-    status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, true);
-    status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, false);
-    status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, true);
-    status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, false);
-    status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, true);
-    status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, false);
+    status |= alu_test(VTA_ALU_OPCODE_MIN, true, 16, 128, true);
+    status |= alu_test(VTA_ALU_OPCODE_MIN, true, 16, 128, false);
+    status |= alu_test(VTA_ALU_OPCODE_MAX, true, 16, 128, true);
+    status |= alu_test(VTA_ALU_OPCODE_MAX, true, 16, 128, false);
+    status |= alu_test(VTA_ALU_OPCODE_ADD, true, 16, 128, true);
+    status |= alu_test(VTA_ALU_OPCODE_ADD, true, 16, 128, false);
+    status |= alu_test(VTA_ALU_OPCODE_SHR, true, 16, 128, true);
+    status |= alu_test(VTA_ALU_OPCODE_SHR, true, 16, 128, false);
 
     // Run ALU test (vector-vector operators)
-    status |= alu_test(ALU_OPCODE_MIN, false, 16, 128, true);
-    status |= alu_test(ALU_OPCODE_MIN, false, 16, 128, false);
-    status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, true);
-    status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, false);
-    status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, true);
-    status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, false);
+    status |= alu_test(VTA_ALU_OPCODE_MIN, false, 16, 128, true);
+    status |= alu_test(VTA_ALU_OPCODE_MIN, false, 16, 128, false);
+    status |= alu_test(VTA_ALU_OPCODE_MAX, false, 16, 128, true);
+    status |= alu_test(VTA_ALU_OPCODE_MAX, false, 16, 128, false);
+    status |= alu_test(VTA_ALU_OPCODE_ADD, false, 16, 128, true);
+    status |= alu_test(VTA_ALU_OPCODE_ADD, false, 16, 128, false);
 
     // Run blocked GEMM test
-    status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 2);
-    status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 2);
-    status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 1);
-    status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 1);
+    status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 2);
+    status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 2);
+    status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 1);
+    status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1);
 
     return status;
-
-}
\ No newline at end of file
+}
diff --git a/vta/hardware/vivado/src/vta.cc b/vta/hardware/vivado/src/vta.cc
index 7a0f9ded25eeb843b7fa3ba324b24d7830b06320..f628b749d23c6240b3021bc1d02d759a3bda7df6 100644
--- a/vta/hardware/vivado/src/vta.cc
+++ b/vta/hardware/vivado/src/vta.cc
@@ -10,80 +10,78 @@
 
 #include "./vta.h"
 
-void fetch (
+void fetch(
   uint32_t insn_count,
   volatile insn_T *insns,
-  hls::stream<insn_T> &load_queue,
-  hls::stream<insn_T> &gemm_queue,
-  hls::stream<insn_T> &store_queue) {
-#pragma HLS INTERFACE s_axilite port=insn_count bundle=CONTROL_BUS
-#pragma HLS INTERFACE m_axi port=insns offset=slave bundle=ins_port
-#pragma HLS INTERFACE axis port=load_queue
-#pragma HLS INTERFACE axis port=gemm_queue
-#pragma HLS INTERFACE axis port=store_queue
-#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS
-
-  INSN_DECODE: for (int pc = 0; pc < insn_count; pc ++) {
-#pragma HLS PIPELINE II=1
+  hls::stream<insn_T> *load_queue,
+  hls::stream<insn_T> *gemm_queue,
+  hls::stream<insn_T> *store_queue) {
+#pragma HLS INTERFACE s_axilite port = insn_count bundle = CONTROL_BUS
+#pragma HLS INTERFACE m_axi port = insns offset = slave bundle = ins_port
+#pragma HLS INTERFACE axis port = load_queue
+#pragma HLS INTERFACE axis port = gemm_queue
+#pragma HLS INTERFACE axis port = store_queue
+#pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
+
+  INSN_DECODE: for (int pc = 0; pc < insn_count; pc++) {
+#pragma HLS PIPELINE II = 1
     // Read instruction fields
     insn_T insn = insns[pc];
     // Do some partial decoding
-    opcode_T opcode = insn.range(INSN_MEM_0_1, INSN_MEM_0_0);
-    memop_id_T memory_type = insn.range(INSN_MEM_5_1, INSN_MEM_5_0);
+    opcode_T opcode = insn.range(VTA_INSN_MEM_0_1, VTA_INSN_MEM_0_0);
+    memop_id_T memory_type = insn.range(VTA_INSN_MEM_5_1, VTA_INSN_MEM_5_0);
     // Push to appropriate instruction queue
-    if (opcode == OPCODE_STORE) {
-      store_queue.write(insn);
-    } else if (opcode == OPCODE_LOAD &&
-               (memory_type == MEM_ID_INP || memory_type == MEM_ID_WGT)) {
-      load_queue.write(insn);
+    if (opcode == VTA_OPCODE_STORE) {
+      store_queue->write(insn);
+    } else if (opcode == VTA_OPCODE_LOAD &&
+          (memory_type == VTA_MEM_ID_INP || memory_type == VTA_MEM_ID_WGT)) {
+      load_queue->write(insn);
     } else {
-      gemm_queue.write(insn);
+      gemm_queue->write(insn);
     }
   }
-
 }
 
-void load (
+void load(
   volatile inp_vec_T *inputs,
   volatile wgt_vec_T *weights,
-  hls::stream<insn_T> &load_queue,
-  hls::stream<bool> &g2l_dep_queue,
-  hls::stream<bool> &l2g_dep_queue,
-  inp_vec_T inp_mem[INP_BUFF_DEPTH][BATCH],
-  wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT]
+  hls::stream<insn_T> *load_queue,
+  hls::stream<bool> *g2l_dep_queue,
+  hls::stream<bool> *l2g_dep_queue,
+  inp_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH],
+  wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT]
   ) {
-#pragma HLS INTERFACE m_axi port=weights offset=slave bundle=data_port
-#pragma HLS INTERFACE m_axi port=inputs offset=slave bundle=data_port
-#pragma HLS INTERFACE axis port=load_queue
-#pragma HLS INTERFACE axis port=g2l_dep_queue
-#pragma HLS INTERFACE axis port=l2g_dep_queue
-#pragma HLS INTERFACE bram port=wgt_mem
-#pragma HLS INTERFACE bram port=inp_mem
-#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS
-// #pragma HLS ARRAY_PARTITION variable=inp_mem complete dim=2
+#pragma HLS INTERFACE m_axi port = weights offset = slave bundle = data_port
+#pragma HLS INTERFACE m_axi port = inputs offset = slave bundle = data_port
+#pragma HLS INTERFACE axis port = load_queue
+#pragma HLS INTERFACE axis port = g2l_dep_queue
+#pragma HLS INTERFACE axis port = l2g_dep_queue
+#pragma HLS INTERFACE bram port = wgt_mem
+#pragma HLS INTERFACE bram port = inp_mem
+#pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
 
   // Pop load instruction
-  insn_T insn = load_queue.read();
+  insn_T insn = load_queue->read();
 
   // Decode instruction
-  bool pop_prev_dependence = insn[INSN_MEM_1];
-  bool pop_next_dependence = insn[INSN_MEM_2];
-  bool push_prev_dependence = insn[INSN_MEM_3];
-  bool push_next_dependence = insn[INSN_MEM_4];
-  memop_id_T memory_type = insn.range(INSN_MEM_5_1, INSN_MEM_5_0);
-  memop_sram_T sram_base = insn.range(INSN_MEM_6_1, INSN_MEM_6_0);
-  memop_dram_T dram_base = insn.range(INSN_MEM_7_1, INSN_MEM_7_0);
-  memop_size_T y_size = insn.range(INSN_MEM_8_1, INSN_MEM_8_0);
-  memop_size_T x_size = insn.range(INSN_MEM_9_1, INSN_MEM_9_0);
-  memop_stride_T x_stride = insn.range(INSN_MEM_A_1, INSN_MEM_A_0);
-  memop_pad_T y_pad_0 = insn.range(INSN_MEM_B_1, INSN_MEM_B_0);
-  memop_pad_T y_pad_1 = insn.range(INSN_MEM_C_1, INSN_MEM_C_0);
-  memop_pad_T x_pad_0 = insn.range(INSN_MEM_D_1, INSN_MEM_D_0);
-  memop_pad_T x_pad_1 = insn.range(INSN_MEM_E_1, INSN_MEM_E_0);
+  bool pop_prev_dependence = insn[VTA_INSN_MEM_1];
+  bool pop_next_dependence = insn[VTA_INSN_MEM_2];
+  bool push_prev_dependence = insn[VTA_INSN_MEM_3];
+  bool push_next_dependence = insn[VTA_INSN_MEM_4];
+  memop_id_T memory_type = insn.range(VTA_INSN_MEM_5_1, VTA_INSN_MEM_5_0);
+  memop_sram_T sram_base = insn.range(VTA_INSN_MEM_6_1, VTA_INSN_MEM_6_0);
+  memop_dram_T dram_base = insn.range(VTA_INSN_MEM_7_1, VTA_INSN_MEM_7_0);
+  memop_size_T y_size = insn.range(VTA_INSN_MEM_8_1, VTA_INSN_MEM_8_0);
+  memop_size_T x_size = insn.range(VTA_INSN_MEM_9_1, VTA_INSN_MEM_9_0);
+  memop_stride_T x_stride = insn.range(VTA_INSN_MEM_A_1, VTA_INSN_MEM_A_0);
+  memop_pad_T y_pad_0 = insn.range(VTA_INSN_MEM_B_1, VTA_INSN_MEM_B_0);
+  memop_pad_T y_pad_1 = insn.range(VTA_INSN_MEM_C_1, VTA_INSN_MEM_C_0);
+  memop_pad_T x_pad_0 = insn.range(VTA_INSN_MEM_D_1, VTA_INSN_MEM_D_0);
+  memop_pad_T x_pad_1 = insn.range(VTA_INSN_MEM_E_1, VTA_INSN_MEM_E_0);
 
   // Pop dependence token if instructed
   if (pop_next_dependence) {
-    g2l_dep_queue.read();
+    g2l_dep_queue->read();
   }
 
   // Initialize indices
@@ -94,29 +92,26 @@ void load (
   memop_size_T y_size_total = y_pad_0 + y_size + y_pad_1;
   memop_size_T x_size_total = x_pad_0 + x_size + x_pad_1;
   memop_sram_T y_offset = x_size_total * y_pad_0;
-#pragma HLS RESOURCE variable=y_offset core=Mul_LUT
+// Force this computation to be done with LUTs to avoid using too many DSPs
+#pragma HLS RESOURCE variable = y_offset core = Mul_LUT
 
   // Skip padding along y dimension
   sram_idx += y_offset;
 
   // Perform data transfer from DRAM
-  for (int y = 0; y < y_size; y ++) {
+  for (int y = 0; y < y_size; y++) {
 #pragma HLS PIPELINE rewind
     // Skip padding along x dimension
     sram_idx += x_pad_0;
     // Perform data transfer
-    if (memory_type == MEM_ID_INP) {
-      memcpy(
-        &inp_mem[sram_idx][0],
-        (const inp_vec_T*) &inputs[dram_idx * BATCH],
-        x_size * INP_ELEM_BYTES
-      );
+    if (memory_type == VTA_MEM_ID_INP) {
+      memcpy(&inp_mem[sram_idx][0],
+             (const inp_vec_T*) &inputs[dram_idx * VTA_BATCH],
+             x_size * VTA_INP_ELEM_BYTES);
     } else {
-      memcpy(
-        &wgt_mem[sram_idx][0],
-        (const wgt_vec_T*) &weights[dram_idx * BLOCK_OUT],
-        x_size * WGT_ELEM_BYTES
-      );
+      memcpy(&wgt_mem[sram_idx][0],
+             (const wgt_vec_T*) &weights[dram_idx * VTA_BLOCK_OUT],
+             x_size * VTA_WGT_ELEM_BYTES);
     }
     sram_idx += x_size;
     dram_idx += x_stride;
@@ -127,136 +122,130 @@ void load (
   // Reset SRAM index
   sram_idx = sram_base;
   // Pad x/y edges with zeros
-  for (int y = 0; y < y_size_total; y ++) {
+  for (int y = 0; y < y_size_total; y++) {
     if (y < y_pad_0 || y >= y_pad_0 + y_size) {
-      for (int x = 0; x < x_size_total; x ++) {
-#pragma HLS PIPELINE II=1 rewind
-        if (memory_type == MEM_ID_INP) {
-          for (int i = 0; i < BATCH; i ++) {
+      for (int x = 0; x < x_size_total; x++) {
+#pragma HLS PIPELINE II = 1 rewind
+        if (memory_type == VTA_MEM_ID_INP) {
+          for (int i = 0; i < VTA_BATCH; i++) {
             inp_mem[sram_idx][i] = 0;
           }
         } else {
-          for (int i = 0; i < BLOCK_OUT; i ++) {
+          for (int i = 0; i < VTA_BLOCK_OUT; i++) {
             wgt_mem[sram_idx][i] = 0;
           }
         }
-        sram_idx ++;
+        sram_idx++;
       }
     } else {
-      for (int x = 0; x < x_pad_0; x ++) {
-#pragma HLS PIPELINE II=1 rewind
-        if (memory_type == MEM_ID_INP) {
-          for (int i = 0; i < BATCH; i ++) {
+      for (int x = 0; x < x_pad_0; x++) {
+#pragma HLS PIPELINE II = 1 rewind
+        if (memory_type == VTA_MEM_ID_INP) {
+          for (int i = 0; i < VTA_BATCH; i++) {
             inp_mem[sram_idx][i] = 0;
           }
         } else {
-          for (int i = 0; i < BLOCK_OUT; i ++) {
+          for (int i = 0; i < VTA_BLOCK_OUT; i++) {
             wgt_mem[sram_idx][i] = 0;
           }
         }
-        sram_idx ++;
+        sram_idx++;
       }
       sram_idx += x_size;
-      for (int x = 0; x < x_pad_1; x ++) {
-#pragma HLS PIPELINE II=1 rewind
-        if (memory_type == MEM_ID_INP) {
-          for (int i = 0; i < BATCH; i ++) {
+      for (int x = 0; x < x_pad_1; x++) {
+#pragma HLS PIPELINE II = 1 rewind
+        if (memory_type == VTA_MEM_ID_INP) {
+          for (int i = 0; i < VTA_BATCH; i++) {
             inp_mem[sram_idx][i] = 0;
           }
         } else {
-          for (int i = 0; i < BLOCK_OUT; i ++) {
+          for (int i = 0; i < VTA_BLOCK_OUT; i++) {
             wgt_mem[sram_idx][i] = 0;
           }
         }
-        sram_idx ++;
+        sram_idx++;
       }
-
     }
   }
 
   // Push dependence token if instructed
   if (push_next_dependence) {
-    l2g_dep_queue.write(1);
+    l2g_dep_queue->write(1);
   }
 }
 
-void compute (
-  volatile uint32_t &done,
+void compute(
+  volatile uint32_t *done,
   volatile uop_T *uops,
   volatile acc_vec_T *biases,
-  hls::stream<insn_T> &gemm_queue,
-  hls::stream<bool> &l2g_dep_queue,
-  hls::stream<bool> &s2g_dep_queue,
-  hls::stream<bool> &g2l_dep_queue,
-  hls::stream<bool> &g2s_dep_queue,
-  out_vec_T inp_mem[INP_BUFF_DEPTH][BATCH],
-  wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT],
-  out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH]
+  hls::stream<insn_T> *gemm_queue,
+  hls::stream<bool> *l2g_dep_queue,
+  hls::stream<bool> *s2g_dep_queue,
+  hls::stream<bool> *g2l_dep_queue,
+  hls::stream<bool> *g2s_dep_queue,
+  out_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH],
+  wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT],
+  out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]
   ) {
-#pragma HLS INTERFACE s_axilite port=done bundle=CONTROL_BUS
-#pragma HLS INTERFACE m_axi port=uops offset=slave bundle=uop_port
-#pragma HLS INTERFACE m_axi port=biases offset=slave bundle=data_port
-#pragma HLS INTERFACE axis port=gemm_queue
-#pragma HLS INTERFACE axis port=l2g_dep_queue
-#pragma HLS INTERFACE axis port=s2g_dep_queue
-#pragma HLS INTERFACE axis port=g2l_dep_queue
-#pragma HLS INTERFACE axis port=g2s_dep_queue
-#pragma HLS INTERFACE bram port=inp_mem
-#pragma HLS INTERFACE bram port=wgt_mem
-#pragma HLS INTERFACE bram port=out_mem
-#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS
-// #pragma HLS ARRAY_PARTITION variable=inp_mem complete dim=2
-// #pragma HLS ARRAY_PARTITION variable=out_mem complete dim=2
+#pragma HLS INTERFACE s_axilite port = done bundle = CONTROL_BUS
+#pragma HLS INTERFACE m_axi port = uops offset = slave bundle = uop_port
+#pragma HLS INTERFACE m_axi port = biases offset = slave bundle = data_port
+#pragma HLS INTERFACE axis port = gemm_queue
+#pragma HLS INTERFACE axis port = l2g_dep_queue
+#pragma HLS INTERFACE axis port = s2g_dep_queue
+#pragma HLS INTERFACE axis port = g2l_dep_queue
+#pragma HLS INTERFACE axis port = g2s_dep_queue
+#pragma HLS INTERFACE bram port = inp_mem
+#pragma HLS INTERFACE bram port = wgt_mem
+#pragma HLS INTERFACE bram port = out_mem
+#pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
 // This is necessary connect the SRAM to the load module
-#pragma HLS RESOURCE variable=wgt_mem core=RAM_1P
+#pragma HLS RESOURCE variable = wgt_mem core = RAM_1P
 
   // Micro-op storage
-  static uop_T uop_mem[UOP_BUFF_DEPTH];
+  static uop_T uop_mem[VTA_UOP_BUFF_DEPTH];
 
   // Accumulator storage
-  static acc_vec_T acc_mem[ACC_BUFF_DEPTH][BATCH];
-#pragma HLS ARRAY_PARTITION variable=acc_mem complete dim=2
+  static acc_vec_T acc_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH];
+#pragma HLS ARRAY_PARTITION variable = acc_mem complete dim = 2
 
   // Pop GEMM instruction
-  insn_T insn = gemm_queue.read();
+  insn_T insn = gemm_queue->read();
 
   // Decode
-  opcode_T opcode = insn.range(INSN_MEM_0_1, INSN_MEM_0_0);
-  bool pop_prev_dependence = insn[INSN_MEM_1];
-  bool pop_next_dependence = insn[INSN_MEM_2];
-  bool push_prev_dependence = insn[INSN_MEM_3];
-  bool push_next_dependence = insn[INSN_MEM_4];
+  opcode_T opcode = insn.range(VTA_INSN_MEM_0_1, VTA_INSN_MEM_0_0);
+  bool pop_prev_dependence = insn[VTA_INSN_MEM_1];
+  bool pop_next_dependence = insn[VTA_INSN_MEM_2];
+  bool push_prev_dependence = insn[VTA_INSN_MEM_3];
+  bool push_next_dependence = insn[VTA_INSN_MEM_4];
 
   // Pop dependence token if instructed
   if (pop_prev_dependence) {
-    l2g_dep_queue.read();
+    l2g_dep_queue->read();
   }
   if (pop_next_dependence) {
-    s2g_dep_queue.read();
+    s2g_dep_queue->read();
   }
 
   // Perform action based on opcode
-  if (opcode == OPCODE_FINISH) {
-
+  if (opcode == VTA_OPCODE_FINISH) {
     // Set done flag if we reach a FINISH instruction
-    done = 1;
-
-  } else if (opcode == OPCODE_LOAD || opcode == OPCODE_STORE) {
-
+    *done = 1;
+  } else if (opcode == VTA_OPCODE_LOAD || opcode == VTA_OPCODE_STORE) {
     // Set done value
-    done = 0;
+    *done = 0;
 
     // Decode instruction
-    memop_id_T memory_type = insn.range(INSN_MEM_5_1, INSN_MEM_5_0);
-    memop_sram_T sram_base = insn.range(INSN_MEM_6_1, INSN_MEM_6_0);
-    memop_dram_T dram_base = insn.range(INSN_MEM_7_1, INSN_MEM_7_0);
-    memop_size_T y_size = insn.range(INSN_MEM_8_1, INSN_MEM_8_0);
-    memop_size_T x_size = insn.range(INSN_MEM_9_1, INSN_MEM_9_0);
-    memop_stride_T x_stride = insn.range(INSN_MEM_A_1, INSN_MEM_A_0);
-    memop_pad_T y_pad_0 = insn.range(INSN_MEM_B_1, INSN_MEM_B_0);
-    memop_pad_T y_pad_1 = insn.range(INSN_MEM_C_1, INSN_MEM_C_0);
-    memop_pad_T x_pad_0 = insn.range(INSN_MEM_D_1, INSN_MEM_D_0);
-    memop_pad_T x_pad_1 = insn.range(INSN_MEM_E_1, INSN_MEM_E_0);
+    memop_id_T memory_type = insn.range(VTA_INSN_MEM_5_1, VTA_INSN_MEM_5_0);
+    memop_sram_T sram_base = insn.range(VTA_INSN_MEM_6_1, VTA_INSN_MEM_6_0);
+    memop_dram_T dram_base = insn.range(VTA_INSN_MEM_7_1, VTA_INSN_MEM_7_0);
+    memop_size_T y_size = insn.range(VTA_INSN_MEM_8_1, VTA_INSN_MEM_8_0);
+    memop_size_T x_size = insn.range(VTA_INSN_MEM_9_1, VTA_INSN_MEM_9_0);
+    memop_stride_T x_stride = insn.range(VTA_INSN_MEM_A_1, VTA_INSN_MEM_A_0);
+    memop_pad_T y_pad_0 = insn.range(VTA_INSN_MEM_B_1, VTA_INSN_MEM_B_0);
+    memop_pad_T y_pad_1 = insn.range(VTA_INSN_MEM_C_1, VTA_INSN_MEM_C_0);
+    memop_pad_T x_pad_0 = insn.range(VTA_INSN_MEM_D_1, VTA_INSN_MEM_D_0);
+    memop_pad_T x_pad_1 = insn.range(VTA_INSN_MEM_E_1, VTA_INSN_MEM_E_0);
 
     // Initialize indices
     memop_sram_T sram_idx = sram_base;
@@ -266,220 +255,202 @@ void compute (
     memop_size_T y_size_total = y_pad_0 + y_size + y_pad_1;
     memop_size_T x_size_total = x_pad_0 + x_size + x_pad_1;
     memop_sram_T y_offset = x_size_total * y_pad_0;
-#pragma HLS RESOURCE variable=y_offset core=Mul_LUT
+// Force this computation to be done with LUTs to avoid using too many DSPs
+#pragma HLS RESOURCE variable = y_offset core = Mul_LUT
 
-    if (memory_type == MEM_ID_UOP) {
+    if (memory_type == VTA_MEM_ID_UOP) {
       // Perform data transfer
-      memcpy(
-        &uop_mem[sram_base],
-        (const uop_T*) &uops[dram_base],
-        x_size * sizeof(uop_T)
-      );
+      memcpy(&uop_mem[sram_base],
+             (const uop_T*) &uops[dram_base],
+             x_size * sizeof(uop_T));
     } else {
       // Skip vertical padding
       sram_idx += y_offset;
       // Perform data transfer from DRAM
-      for (int y = 0; y < y_size; y ++) {
+      for (int y = 0; y < y_size; y++) {
 #pragma HLS PIPELINE rewind
         // Skip padding along x dimension
         sram_idx += x_pad_0;
         // Perform data transfer
-        memcpy(
-            &acc_mem[sram_idx][0],
-            (const acc_vec_T*) &biases[dram_idx * BATCH],
-            x_size*ACC_ELEM_BYTES
-        );
+        memcpy(&acc_mem[sram_idx][0],
+               (const acc_vec_T*) &biases[dram_idx * VTA_BATCH],
+               x_size*VTA_ACC_ELEM_BYTES);
         sram_idx += x_size;
         dram_idx += x_stride;
         // Skip padding along x dimension
         sram_idx += x_pad_1;
       }
     }
-
-  } else if (opcode == OPCODE_GEMM || opcode == OPCODE_ALU) {
-
+  } else if (opcode == VTA_OPCODE_GEMM || opcode == VTA_OPCODE_ALU) {
     // Set done value
-    done = 0;
+    *done = 0;
 
     // Decode
-    uop_idx_T uop_bgn = insn.range(INSN_GEM_5_1, INSN_GEM_5_0);
-    uop_idx_T uop_end = insn.range(INSN_GEM_6_1, INSN_GEM_6_0);
-    loop_T iter_out  = insn.range(INSN_GEM_7_1, INSN_GEM_7_0);
-    loop_T iter_in  = insn.range(INSN_GEM_8_1, INSN_GEM_8_0);
-    acc_idx_T dst_factor_out = insn.range(INSN_GEM_9_1, INSN_GEM_9_0);
-    acc_idx_T dst_factor_in = insn.range(INSN_GEM_A_1, INSN_GEM_A_0);
-    inp_idx_T src_factor_out = insn.range(INSN_GEM_B_1, INSN_GEM_B_0);
-    inp_idx_T src_factor_in = insn.range(INSN_GEM_C_1, INSN_GEM_C_0);
+    uop_idx_T uop_bgn = insn.range(VTA_INSN_GEM_5_1, VTA_INSN_GEM_5_0);
+    uop_idx_T uop_end = insn.range(VTA_INSN_GEM_6_1, VTA_INSN_GEM_6_0);
+    loop_T iter_out  = insn.range(VTA_INSN_GEM_7_1, VTA_INSN_GEM_7_0);
+    loop_T iter_in  = insn.range(VTA_INSN_GEM_8_1, VTA_INSN_GEM_8_0);
+    acc_idx_T dst_factor_out = insn.range(VTA_INSN_GEM_9_1, VTA_INSN_GEM_9_0);
+    acc_idx_T dst_factor_in = insn.range(VTA_INSN_GEM_A_1, VTA_INSN_GEM_A_0);
+    inp_idx_T src_factor_out = insn.range(VTA_INSN_GEM_B_1, VTA_INSN_GEM_B_0);
+    inp_idx_T src_factor_in = insn.range(VTA_INSN_GEM_C_1, VTA_INSN_GEM_C_0);
 
     // GEMM-specific fields
-    wgt_idx_T wgt_factor_out = insn.range(INSN_GEM_D_1, INSN_GEM_D_0);
-    wgt_idx_T wgt_factor_in = insn.range(INSN_GEM_E_1, INSN_GEM_E_0);
+    wgt_idx_T wgt_factor_out = insn.range(VTA_INSN_GEM_D_1, VTA_INSN_GEM_D_0);
+    wgt_idx_T wgt_factor_in = insn.range(VTA_INSN_GEM_E_1, VTA_INSN_GEM_E_0);
 
     // ALU-specific field
-    aluop_opcode_T alu_opcode = insn.range(INSN_ALU_D_1, INSN_ALU_D_0);
-    bool use_imm = insn[INSN_ALU_E];
-    aluop_imm_T imm = insn.range(INSN_ALU_F_1, INSN_ALU_F_0);
-
+    aluop_opcode_T alu_opcode = insn.range(VTA_INSN_ALU_D_1, VTA_INSN_ALU_D_0);
+    bool use_imm = insn[VTA_INSN_ALU_E];
+    aluop_imm_T imm = insn.range(VTA_INSN_ALU_F_1, VTA_INSN_ALU_F_0);
     acc_idx_T dst_offset_out = 0;
     inp_idx_T src_offset_out = 0;
     wgt_idx_T wgt_offset_out = 0;
 
     // Outer Loop
-    EXE_OUT_LOOP: for (int it_out = 0; it_out < iter_out; it_out ++) {
-#pragma HLS DEPENDENCE variable=acc_mem inter false
-
+    EXE_OUT_LOOP: for (int it_out = 0; it_out < iter_out; it_out++) {
+#pragma HLS DEPENDENCE variable = acc_mem inter false
       acc_idx_T dst_offset_in = dst_offset_out;
       inp_idx_T src_offset_in = src_offset_out;
       wgt_idx_T wgt_offset_in = wgt_offset_out;
 
       // Inner Loop
-      EXE_IN_LOOP: for (int it_in = 0; it_in < iter_in; it_in ++) {
-
+      EXE_IN_LOOP: for (int it_in = 0; it_in < iter_in; it_in++) {
         // Perform appropriate computation based on opcode
-        if (opcode == OPCODE_GEMM) {
-
+        if (opcode == VTA_OPCODE_GEMM) {
           // Iterate over micro op
-          READ_GEMM_UOP: for (int upc = uop_bgn; upc < uop_end; upc ++) {
-#pragma HLS PIPELINE II=1 rewind
+          READ_GEMM_UOP: for (int upc = uop_bgn; upc < uop_end; upc++) {
+#pragma HLS PIPELINE II = 1 rewind
 
             // Read micro-op fields
             uop_T uop = uop_mem[upc];
 
             // Decode indices
-            bool reset_out = uop[UOP_GEM_0];
+            bool reset_out = uop[VTA_UOP_GEM_0];
             acc_idx_T dst_idx =
-              uop.range(UOP_GEM_1_1, UOP_GEM_1_0) + dst_offset_in;
+                uop.range(VTA_UOP_GEM_1_1, VTA_UOP_GEM_1_0) + dst_offset_in;
             acc_idx_T src_idx =
-              uop.range(UOP_GEM_2_1, UOP_GEM_2_0) + src_offset_in;
+                uop.range(VTA_UOP_GEM_2_1, VTA_UOP_GEM_2_0) + src_offset_in;
             wgt_idx_T wgt_idx =
-              uop.range(UOP_GEM_3_1, UOP_GEM_3_0) + wgt_offset_in;
+                uop.range(VTA_UOP_GEM_3_1, VTA_UOP_GEM_3_0) + wgt_offset_in;
 
             // Read weight matrix
-            wgt_vec_T w_matrix[BLOCK_OUT];
-            for (int i = 0; i < BLOCK_OUT; i ++) {
+            wgt_vec_T w_matrix[VTA_BLOCK_OUT];
+            for (int i = 0; i < VTA_BLOCK_OUT; i++) {
               w_matrix[i] = wgt_mem[wgt_idx][i];
             }
             // Read input matrix and accum matrix
-            acc_vec_T o_matrix[BATCH];
-            out_vec_T i_matrix[BATCH];
-            for (int i = 0; i < BATCH; i ++) {
+            acc_vec_T o_matrix[VTA_BATCH];
+            out_vec_T i_matrix[VTA_BATCH];
+            for (int i = 0; i < VTA_BATCH; i++) {
               o_matrix[i] = acc_mem[dst_idx][i];
               i_matrix[i] = inp_mem[src_idx][i];
             }
             // Result matrices
-            acc_vec_T acc_mem_val[BATCH];
-            out_vec_T st_buf_val[BATCH];
+            acc_vec_T acc_mem_val[VTA_BATCH];
+            out_vec_T st_buf_val[VTA_BATCH];
 
             // Inner GEMM loop
-            for (int i = 0; i < BATCH; i ++) {
-              for (int b = 0; b < BLOCK_OUT; b ++) {
+            for (int i = 0; i < VTA_BATCH; i++) {
+              for (int b = 0; b < VTA_BLOCK_OUT; b++) {
                 // Initialize the accumulator values
                 acc_T accum =
-                  o_matrix[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH);
+                  o_matrix[i].range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH);
                 // Dot product sum
                 sum_T tmp = 0;
                 // Inner matrix multiplication loop (input channel/feature)
-                for (int k=0; k<BLOCK_IN; k++) {
+                for (int k = 0; k < VTA_BLOCK_IN; k++) {
                   wgt_T w_elem =
-                    w_matrix[b].range((k + 1) * WGT_WIDTH - 1, k * WGT_WIDTH);
+                      w_matrix[b].range((k + 1) * VTA_WGT_WIDTH - 1, k * VTA_WGT_WIDTH);
                   inp_T i_elem =
-                    i_matrix[i].range((k + 1) * INP_WIDTH - 1, k * INP_WIDTH);
+                      i_matrix[i].range((k + 1) * VTA_INP_WIDTH - 1, k * VTA_INP_WIDTH);
                   mul_T prod = i_elem * w_elem;
                   tmp += (sum_T) prod;
                 }
                 // Update summation
                 accum += (acc_T) tmp;
                 // Update result vector
-                acc_mem_val[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH) =
-                  reset_out ? (acc_T) 0 : accum;
-                st_buf_val[i].range((b + 1) * INP_WIDTH - 1, b * INP_WIDTH) =
-                  (inp_T) accum.range(INP_WIDTH - 1, 0);
+                acc_mem_val[i].range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH) =
+                    reset_out ? (acc_T) 0 : accum;
+                st_buf_val[i].range((b + 1) * VTA_OUT_WIDTH - 1, b * VTA_OUT_WIDTH) =
+                    (out_T) accum.range(VTA_OUT_WIDTH - 1, 0);
               }
               // Write to buffers
               acc_mem[dst_idx][i] = acc_mem_val[i];
               out_mem[dst_idx][i] = st_buf_val[i];
             }
           }
-
-        } else if (opcode == OPCODE_ALU) {
-
+        } else if (opcode == VTA_OPCODE_ALU) {
           // Iterate over micro op
-          READ_ALU_UOP: for (int upc = uop_bgn; upc < uop_end; upc ++) {
-
+          READ_ALU_UOP: for (int upc = uop_bgn; upc < uop_end; upc++) {
             // Read micro-op fields
             uop_T uop = uop_mem[upc];
 
             // Decode
-            bool reset_out = uop[UOP_ALU_0];
+            bool reset_out = uop[VTA_UOP_ALU_0];
             acc_idx_T dst_idx =
-              uop.range(UOP_ALU_1_1, UOP_ALU_1_0) + dst_offset_in;
+                uop.range(VTA_UOP_ALU_1_1, VTA_UOP_ALU_1_0) + dst_offset_in;
             acc_idx_T src_idx =
-              uop.range(UOP_ALU_2_1, UOP_ALU_2_0) + src_offset_in;
+                uop.range(VTA_UOP_ALU_2_1, VTA_UOP_ALU_2_0) + src_offset_in;
 
             // Read input matrix and accum matrix
-            acc_vec_T dst_matrix[BATCH];
-            acc_vec_T src_matrix[BATCH];
-            for (int i = 0; i < BATCH; i ++) {
+            acc_vec_T dst_matrix[VTA_BATCH];
+            acc_vec_T src_matrix[VTA_BATCH];
+            for (int i = 0; i < VTA_BATCH; i++) {
 #pragma HLS UNROLL complete
               dst_matrix[i] = acc_mem[dst_idx][i];
               src_matrix[i] = acc_mem[src_idx][i];
             }
 
             // Result matrices
-            acc_vec_T cmp_res[BATCH];
-            acc_vec_T add_res[BATCH];
-            acc_vec_T shr_res[BATCH];
-            out_vec_T short_cmp_res[BATCH];
-            out_vec_T short_add_res[BATCH];
-            out_vec_T short_shr_res[BATCH];
+            acc_vec_T cmp_res[VTA_BATCH];
+            acc_vec_T add_res[VTA_BATCH];
+            acc_vec_T shr_res[VTA_BATCH];
+            out_vec_T short_cmp_res[VTA_BATCH];
+            out_vec_T short_add_res[VTA_BATCH];
+            out_vec_T short_shr_res[VTA_BATCH];
 
             // Perform ALU op over matrix elements
-            for (int i = 0; i < BATCH; i ++) {
-#pragma HLS PIPELINE II=1 rewind
+            for (int i = 0; i < VTA_BATCH; i++) {
+#pragma HLS PIPELINE II = 1 rewind
               // Results vector
               acc_vec_T res_vec = 0;
-              for (int b = 0; b < BLOCK_OUT; b ++) {
+              for (int b = 0; b < VTA_BLOCK_OUT; b++) {
                 // Read in operands
-                acc_T src_0 =
-                  dst_matrix[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH);
-                acc_T src_1 =
-                  use_imm ?
+                acc_T src_0 = dst_matrix[i].range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH);
+                acc_T src_1 = use_imm ?
                     (acc_T) imm :
-                    src_matrix[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH);
+                    src_matrix[i].range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH);
                 // Compute Min/Max
-                acc_T mix_val =
-                  src_0 < src_1 ?
-                    (alu_opcode == ALU_OPCODE_MIN ? src_0 : src_1) :
-                    (alu_opcode == ALU_OPCODE_MIN ? src_1 : src_0);
-                cmp_res[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH) =
-                  mix_val;
-                short_cmp_res[i].range((b + 1) * INP_WIDTH - 1, b * INP_WIDTH) =
-                  (inp_T) mix_val.range(INP_WIDTH - 1, 0);
+                acc_T mix_val = src_0 < src_1 ?
+                    (alu_opcode == VTA_ALU_OPCODE_MIN ? src_0 : src_1) :
+                    (alu_opcode == VTA_ALU_OPCODE_MIN ? src_1 : src_0);
+                cmp_res[i].range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH) = mix_val;
+                short_cmp_res[i].range((b + 1) * VTA_OUT_WIDTH - 1, b * VTA_OUT_WIDTH) =
+                    (out_T) mix_val.range(VTA_OUT_WIDTH - 1, 0);
                 // Compute Sum
                 acc_T add_val =
-                  src_0.range(ACC_WIDTH - 1, 0) + src_1.range(ACC_WIDTH - 1, 0);
-                add_res[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH) =
-                  add_val;
-                short_add_res[i].range((b + 1) * INP_WIDTH - 1, b * INP_WIDTH) =
-                  (inp_T) add_val.range(INP_WIDTH - 1, 0);
+                    src_0.range(VTA_ACC_WIDTH - 1, 0) + src_1.range(VTA_ACC_WIDTH - 1, 0);
+                add_res[i].range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH) = add_val;
+                short_add_res[i].range((b + 1) * VTA_OUT_WIDTH - 1, b * VTA_OUT_WIDTH) =
+                    (out_T) add_val.range(VTA_OUT_WIDTH - 1, 0);
                 // Compute Shift
                 acc_T shr_val =
-                  src_0 >> (aluop_sh_imm_T) src_1.range(LOG_ACC_WIDTH - 1, 0);
-                shr_res[i].range((b + 1) * ACC_WIDTH - 1, b * ACC_WIDTH) =
-                  shr_val;
-                short_shr_res[i].range((b + 1) * INP_WIDTH - 1, b * INP_WIDTH) =
-                  (inp_T) shr_val.range(INP_WIDTH-1, 0);
+                    src_0 >> (aluop_sh_imm_T) src_1.range(VTA_LOG_ACC_WIDTH - 1, 0);
+                shr_res[i].range((b + 1) * VTA_ACC_WIDTH - 1, b * VTA_ACC_WIDTH) = shr_val;
+                short_shr_res[i].range((b + 1) * VTA_OUT_WIDTH - 1, b * VTA_OUT_WIDTH) =
+                    (out_T) shr_val.range(VTA_OUT_WIDTH - 1, 0);
               }
 
               // Store to accum memory/store buffer
-              if (alu_opcode == ALU_OPCODE_MIN ||
-                  alu_opcode == ALU_OPCODE_MAX) {
+              if (alu_opcode == VTA_ALU_OPCODE_MIN ||
+                  alu_opcode == VTA_ALU_OPCODE_MAX) {
                 acc_mem[dst_idx][i] = cmp_res[i];
                 out_mem[dst_idx][i] = short_cmp_res[i];
-              } else if (alu_opcode==ALU_OPCODE_ADD) {
+              } else if (alu_opcode == VTA_ALU_OPCODE_ADD) {
                 acc_mem[dst_idx][i] = add_res[i];
                 out_mem[dst_idx][i] = short_add_res[i];
-              } else if (alu_opcode==ALU_OPCODE_SHR) {
+              } else if (alu_opcode == VTA_ALU_OPCODE_SHR) {
                 acc_mem[dst_idx][i] = shr_res[i];
                 out_mem[dst_idx][i] = short_shr_res[i];
               }
@@ -502,51 +473,49 @@ void compute (
 
   // Push dependence token if instructed
   if (push_prev_dependence) {
-    g2l_dep_queue.write(1);
+    g2l_dep_queue->write(1);
   }
   if (push_next_dependence) {
-    g2s_dep_queue.write(1);
+    g2s_dep_queue->write(1);
   }
-
 }
 
-void store (
+void store(
   volatile out_vec_T *outputs,
-  hls::stream<insn_T> &store_queue,
-  hls::stream<bool> &g2s_dep_queue,
-  hls::stream<bool> &s2g_dep_queue,
-  out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH]
+  hls::stream<insn_T> *store_queue,
+  hls::stream<bool> *g2s_dep_queue,
+  hls::stream<bool> *s2g_dep_queue,
+  out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]
   ) {
-#pragma HLS INTERFACE m_axi port=outputs offset=slave bundle=data_port
-#pragma HLS INTERFACE axis port=store_queue
-#pragma HLS INTERFACE axis port=g2s_dep_queue
-#pragma HLS INTERFACE axis port=s2g_dep_queue
-#pragma HLS INTERFACE bram port=out_mem
-#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS
-// #pragma HLS ARRAY_PARTITION variable=out_mem complete dim=2
+#pragma HLS INTERFACE m_axi port = outputs offset = slave bundle = data_port
+#pragma HLS INTERFACE axis port = store_queue
+#pragma HLS INTERFACE axis port = g2s_dep_queue
+#pragma HLS INTERFACE axis port = s2g_dep_queue
+#pragma HLS INTERFACE bram port = out_mem
+#pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
 
   // Load buffer
-  insn_T insn = store_queue.read();
+  insn_T insn = store_queue->read();
 
   // Decode
-  bool pop_prev_dependence = insn[INSN_MEM_1];
-  bool pop_next_dependence = insn[INSN_MEM_2];
-  bool push_prev_dependence = insn[INSN_MEM_3];
-  bool push_next_dependence = insn[INSN_MEM_4];
-  memop_id_T memory_type = insn.range(INSN_MEM_5_1, INSN_MEM_5_0);
-  memop_sram_T sram_base = insn.range(INSN_MEM_6_1, INSN_MEM_6_0);
-  memop_dram_T dram_base = insn.range(INSN_MEM_7_1, INSN_MEM_7_0);
-  memop_size_T y_size = insn.range(INSN_MEM_8_1, INSN_MEM_8_0);
-  memop_size_T x_size = insn.range(INSN_MEM_9_1, INSN_MEM_9_0);
-  memop_stride_T x_stride = insn.range(INSN_MEM_A_1, INSN_MEM_A_0);
-  memop_pad_T y_pad_0 = insn.range(INSN_MEM_B_1, INSN_MEM_B_0);
-  memop_pad_T y_pad_1 = insn.range(INSN_MEM_C_1, INSN_MEM_C_0);
-  memop_pad_T x_pad_0 = insn.range(INSN_MEM_D_1, INSN_MEM_D_0);
-  memop_pad_T x_pad_1 = insn.range(INSN_MEM_E_1, INSN_MEM_E_0);
+  bool pop_prev_dependence = insn[VTA_INSN_MEM_1];
+  bool pop_next_dependence = insn[VTA_INSN_MEM_2];
+  bool push_prev_dependence = insn[VTA_INSN_MEM_3];
+  bool push_next_dependence = insn[VTA_INSN_MEM_4];
+  memop_id_T memory_type = insn.range(VTA_INSN_MEM_5_1, VTA_INSN_MEM_5_0);
+  memop_sram_T sram_base = insn.range(VTA_INSN_MEM_6_1, VTA_INSN_MEM_6_0);
+  memop_dram_T dram_base = insn.range(VTA_INSN_MEM_7_1, VTA_INSN_MEM_7_0);
+  memop_size_T y_size = insn.range(VTA_INSN_MEM_8_1, VTA_INSN_MEM_8_0);
+  memop_size_T x_size = insn.range(VTA_INSN_MEM_9_1, VTA_INSN_MEM_9_0);
+  memop_stride_T x_stride = insn.range(VTA_INSN_MEM_A_1, VTA_INSN_MEM_A_0);
+  memop_pad_T y_pad_0 = insn.range(VTA_INSN_MEM_B_1, VTA_INSN_MEM_B_0);
+  memop_pad_T y_pad_1 = insn.range(VTA_INSN_MEM_C_1, VTA_INSN_MEM_C_0);
+  memop_pad_T x_pad_0 = insn.range(VTA_INSN_MEM_D_1, VTA_INSN_MEM_D_0);
+  memop_pad_T x_pad_1 = insn.range(VTA_INSN_MEM_E_1, VTA_INSN_MEM_E_0);
 
   // Pop dependence token if instructed
   if (pop_prev_dependence) {
-    g2s_dep_queue.read();
+    g2s_dep_queue->read();
   }
 
   // Initialize indices
@@ -556,18 +525,19 @@ void store (
   // Skip padding along y dimension
   memop_sram_T y_offset = (x_pad_0 + x_size + x_pad_1) * y_pad_0;
   sram_idx += y_offset;
-#pragma HLS RESOURCE variable=y_offset core=Mul_LUT
+// Force this computation to be done with LUTs to avoid using too many DSPs
+#pragma HLS RESOURCE variable = y_offset core = Mul_LUT
 
   // Copy along y dimension
-  for (int y = 0; y < y_size; y ++) {
+  for (int y = 0; y < y_size; y++) {
 #pragma HLS PIPELINE rewind
     // Skip padding along x dimension
     sram_idx += x_pad_0;
     // Perform data transfer
     memcpy(
-      (out_vec_T *) &outputs[dram_idx*BATCH],
+      const_cast<out_vec_T*>(&outputs[dram_idx*VTA_BATCH]),
       (const out_vec_T*) &out_mem[sram_idx][0],
-      x_size * INP_ELEM_BYTES);
+      x_size * VTA_INP_ELEM_BYTES);
     sram_idx += x_size;
     dram_idx += x_stride;
     // Skip padding along x dimension
@@ -576,11 +546,11 @@ void store (
 
   // Push dependence token if instructed
   if (push_prev_dependence) {
-    s2g_dep_queue.write(1);
+    s2g_dep_queue->write(1);
   }
 }
 
-void vta (
+void vta(
   uint32_t insn_count,
   volatile insn_T *insns,
   volatile uop_T *uops,
@@ -588,14 +558,14 @@ void vta (
   volatile wgt_vec_T *weights,
   volatile acc_vec_T *biases,
   volatile out_vec_T *outputs) {
-#pragma HLS INTERFACE s_axilite port=insn_count bundle=CONTROL_BUS
-#pragma HLS INTERFACE m_axi port=insns offset=slave bundle=ins_port
-#pragma HLS INTERFACE m_axi port=uops offset=slave bundle=uop_port
-#pragma HLS INTERFACE m_axi port=inputs offset=slave bundle=data_port
-#pragma HLS INTERFACE m_axi port=weights offset=slave bundle=data_port
-#pragma HLS INTERFACE m_axi port=biases offset=slave bundle=data_port
-#pragma HLS INTERFACE m_axi port=outputs offset=slave bundle=data_port
-#pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS
+#pragma HLS INTERFACE s_axilite port = insn_count bundle = CONTROL_BUS
+#pragma HLS INTERFACE m_axi port = insns offset = slave bundle = ins_port
+#pragma HLS INTERFACE m_axi port = uops offset = slave bundle = uop_port
+#pragma HLS INTERFACE m_axi port = inputs offset = slave bundle = data_port
+#pragma HLS INTERFACE m_axi port = weights offset = slave bundle = data_port
+#pragma HLS INTERFACE m_axi port = biases offset = slave bundle = data_port
+#pragma HLS INTERFACE m_axi port = outputs offset = slave bundle = data_port
+#pragma HLS INTERFACE s_axilite port = return bundle = CONTROL_BUS
 
   // Instantiate temporary instruction queues (used for peeking)
   hls::stream<insn_T> tmp_load_queue;
@@ -614,18 +584,12 @@ void vta (
   hls::stream<bool> g2s_dep_queue;
 
   // Instantiate memories
-  inp_vec_T inp_mem[INP_BUFF_DEPTH][BATCH];
-  wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT];
-  out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH];
+  inp_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH];
+  wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT];
+  out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH];
 
   // Push all instructions into the queues
-  fetch(
-    insn_count,
-    insns,
-    tmp_load_queue,
-    tmp_gemm_queue,
-    tmp_store_queue
-  );
+  fetch(insn_count, insns, &tmp_load_queue, &tmp_gemm_queue, &tmp_store_queue);
 
   // Global done indicator
   uint32_t done = 0;
@@ -651,21 +615,13 @@ void vta (
         tmp_load_popped = true;
       }
       // Check dependences and invoke the load stage
-      bool pop_next_dependence = tmp_load[INSN_MEM_2];
+      bool pop_next_dependence = tmp_load[VTA_INSN_MEM_2];
       if ((pop_next_dependence && !g2l_dep_queue.empty()) ||
           !pop_next_dependence) {
         // Push the instruction in the load queue
         load_queue.write(tmp_load);
         tmp_load_popped = false;
-        load(
-          inputs,
-          weights,
-          load_queue,
-          g2l_dep_queue,
-          l2g_dep_queue,
-          inp_mem,
-          wgt_mem
-        );
+        load(inputs, weights, &load_queue, &g2l_dep_queue, &l2g_dep_queue, inp_mem, wgt_mem);
       } else {
         // Execution of load stage pending on completion of other stages, so break here...
         break;
@@ -679,8 +635,8 @@ void vta (
         tmp_gemm_popped = true;
       }
       // Check dependences and invoke the load stage
-      bool pop_prev_dependence = tmp_gemv[INSN_MEM_1];
-      bool pop_next_dependence = tmp_gemv[INSN_MEM_2];
+      bool pop_prev_dependence = tmp_gemv[VTA_INSN_MEM_1];
+      bool pop_next_dependence = tmp_gemv[VTA_INSN_MEM_2];
       if (
         (pop_prev_dependence && !l2g_dep_queue.empty() &&
          pop_next_dependence && !s2g_dep_queue.empty()) ||
@@ -693,19 +649,8 @@ void vta (
         // Push the instruction in the load queue
         gemm_queue.write(tmp_gemv);
         tmp_gemm_popped = false;
-        compute(
-          done,
-          uops,
-          biases,
-          gemm_queue,
-          l2g_dep_queue,
-          s2g_dep_queue,
-          g2l_dep_queue,
-          g2s_dep_queue,
-          inp_mem,
-          wgt_mem,
-          out_mem
-        );
+        compute(&done, uops, biases, &gemm_queue, &l2g_dep_queue, &s2g_dep_queue,
+                &g2l_dep_queue, &g2s_dep_queue, inp_mem, wgt_mem, out_mem);
       } else {
         // Execution of load stage pending on completion of other stages,
         // so break here...
@@ -720,19 +665,13 @@ void vta (
         tmp_store_popped = true;
       }
       // Check dependences and invoke the load stage
-      bool pop_prev_dependence = tmp_store[INSN_MEM_1];
+      bool pop_prev_dependence = tmp_store[VTA_INSN_MEM_1];
       if ((pop_prev_dependence && !g2s_dep_queue.empty()) ||
           !pop_prev_dependence) {
         // Push the instruction in the load queue
         store_queue.write(tmp_store);
         tmp_store_popped = false;
-        store(
-          outputs,
-          store_queue,
-          g2s_dep_queue,
-          s2g_dep_queue,
-          out_mem
-        );
+        store(outputs, &store_queue, &g2s_dep_queue, &s2g_dep_queue, out_mem);
       } else {
         // Execution of load stage pending on completion of other stages, so break here...
         break;
@@ -742,7 +681,7 @@ void vta (
     if (done) {
       break;
     }
-    exit_counter ++;
+    exit_counter++;
     if (exit_counter > 1000) {
       if (tmp_load_popped) {
         if (g2l_dep_queue.empty()) {
@@ -750,10 +689,10 @@ void vta (
         }
       }
       if (tmp_gemm_popped) {
-        if (l2g_dep_queue.empty() && tmp_gemv[INSN_MEM_1]) {
+        if (l2g_dep_queue.empty() && tmp_gemv[VTA_INSN_MEM_1]) {
           printf("waiting on l2g\n");
         }
-        if (s2g_dep_queue.empty() && tmp_gemv[INSN_MEM_2]) {
+        if (s2g_dep_queue.empty() && tmp_gemv[VTA_INSN_MEM_2]) {
           printf("waiting on s2g\n");
         }
       }
@@ -772,17 +711,17 @@ void vta (
   int s2g_count = 0;
   int g2l_count = 0;
   int g2s_count = 0;
-  while(l2g_dep_queue.read_nb(tmp_tok)) {
-    l2g_count ++;
+  while (l2g_dep_queue.read_nb(tmp_tok)) {
+    l2g_count++;
   }
-  while(s2g_dep_queue.read_nb(tmp_tok)) {
-    s2g_count ++;
+  while (s2g_dep_queue.read_nb(tmp_tok)) {
+    s2g_count++;
   }
-  while(g2l_dep_queue.read_nb(tmp_tok)) {
-    g2l_count ++;
+  while (g2l_dep_queue.read_nb(tmp_tok)) {
+    g2l_count++;
   }
-  while(g2s_dep_queue.read_nb(tmp_tok)) {
-    g2s_count ++;
+  while (g2s_dep_queue.read_nb(tmp_tok)) {
+    g2s_count++;
   }
 
   assert(l2g_count == 0 && g2s_count == 0 && g2l_count == 0 && g2s_count == 0);
diff --git a/vta/hardware/vivado/src/vta.h b/vta/hardware/vivado/src/vta.h
index 5dd4d953e4364196a8ff918260a6d19e76e15746..37395722f5f742bc05ad50a4fed20fd6ceeadf53 100644
--- a/vta/hardware/vivado/src/vta.h
+++ b/vta/hardware/vivado/src/vta.h
@@ -3,96 +3,96 @@
  * \file vta.h
  * \brief Type definitions and prototype for VTA HLS design.
  */
-#ifndef VTA_MAIN_H_
-#define VTA_MAIN_H_
+#ifndef VTA_VTA_H_
+#define VTA_VTA_H_
 
-#include <assert.h>
 #include <ap_axi_sdata.h>
 #include <ap_int.h>
+#include <assert.h>
 #include <hls_stream.h>
 
 #include <vta/hw_spec.h>
 
 /* \typedef uop_T Micro-op datatype*/
-typedef ap_uint<UOP_WIDTH> uop_T;
+typedef ap_uint<VTA_UOP_WIDTH> uop_T;
 
 /* \typedef inp_T Input datatype*/
-typedef ap_int<INP_WIDTH> inp_T;
+typedef ap_int<VTA_INP_WIDTH> inp_T;
 
 /* \typedef wgt_T Weight datatype*/
-typedef ap_int<WGT_WIDTH> wgt_T;
+typedef ap_int<VTA_WGT_WIDTH> wgt_T;
 
 /* \typedef out_T Output datatype*/
-typedef ap_int<OUT_WIDTH> out_T;
+typedef ap_int<VTA_OUT_WIDTH> out_T;
 
 /* \typedef acc_T Accumulator datatype*/
-typedef ap_int<ACC_WIDTH> acc_T;
+typedef ap_int<VTA_ACC_WIDTH> acc_T;
 
 /* \typedef mul_T Multiplier output datatype*/
-typedef ap_int<WGT_WIDTH+INP_WIDTH+1> mul_T;
+typedef ap_int<VTA_WGT_WIDTH+VTA_INP_WIDTH+1> mul_T;
 
 /* \typedef sum_T GEMM accumulator datatype*/
-typedef ap_int<WGT_WIDTH+INP_WIDTH+LOG_BLOCK_IN+1> sum_T;
+typedef ap_int<VTA_WGT_WIDTH+VTA_INP_WIDTH+VTA_LOG_BLOCK_IN+1> sum_T;
 
 /* \typedef inp_vec_T Input vector datatype*/
-typedef ap_uint<INP_WIDTH*BLOCK_IN> inp_vec_T;
+typedef ap_uint<VTA_INP_WIDTH*VTA_BLOCK_IN> inp_vec_T;
 
 /* \typedef wgt_vec_T Weight vector datatype*/
-typedef ap_uint<WGT_WIDTH*BLOCK_IN> wgt_vec_T;
+typedef ap_uint<VTA_WGT_WIDTH*VTA_BLOCK_IN> wgt_vec_T;
 
 /* \typedef acc_vec_T Accumulator vector datatype*/
-typedef ap_uint<ACC_WIDTH*BLOCK_OUT> acc_vec_T;
+typedef ap_uint<VTA_ACC_WIDTH*VTA_BLOCK_OUT> acc_vec_T;
 
 /* \typedef out_vec_T Output vector datatype*/
-typedef ap_uint<OUT_WIDTH*BLOCK_OUT> out_vec_T;
+typedef ap_uint<VTA_OUT_WIDTH*VTA_BLOCK_OUT> out_vec_T;
 
 /* \typedef uop_idx_T Micro-op SRAM index datatype*/
-typedef ap_uint<LOG_UOP_BUFF_DEPTH+1> uop_idx_T;
+typedef ap_uint<VTA_LOG_UOP_BUFF_DEPTH+1> uop_idx_T;
 
 /* \typedef inp_idx_T Input SRAM index datatype*/
-typedef ap_uint<LOG_INP_BUFF_DEPTH+1> inp_idx_T;
+typedef ap_uint<VTA_LOG_INP_BUFF_DEPTH+1> inp_idx_T;
 
 /* \typedef wgt_idx_T Weight SRAM index datatype*/
-typedef ap_uint<LOG_WGT_BUFF_DEPTH+1> wgt_idx_T;
+typedef ap_uint<VTA_LOG_WGT_BUFF_DEPTH+1> wgt_idx_T;
 
 /* \typedef acc_idx_T Accumulator SRAM index datatype*/
-typedef ap_uint<LOG_ACC_BUFF_DEPTH+1> acc_idx_T;
+typedef ap_uint<VTA_LOG_ACC_BUFF_DEPTH+1> acc_idx_T;
 
 /* \typedef opcode_T Opcode datatype*/
-typedef ap_uint<OPCODE_BIT_WIDTH> opcode_T;
+typedef ap_uint<VTA_OPCODE_BIT_WIDTH> opcode_T;
 
 /* \typedef insn_T Instruction datatype*/
-typedef ap_uint<INS_WIDTH> insn_T;
+typedef ap_uint<VTA_INS_WIDTH> insn_T;
 
 /* \typedef loop_T Loop bound datatype*/
-typedef ap_uint<LOOP_ITER_WIDTH> loop_T;
+typedef ap_uint<VTA_LOOP_ITER_WIDTH> loop_T;
 
 /* \typedef memop_id_T Memory operation ID datatype*/
-typedef ap_uint<MEMOP_ID_BIT_WIDTH> memop_id_T;
+typedef ap_uint<VTA_MEMOP_ID_BIT_WIDTH> memop_id_T;
 
 /* \typedef memop_sram_T Memory operation SRAM index datatype*/
-typedef ap_uint<MEMOP_SRAM_ADDR_BIT_WIDTH> memop_sram_T;
+typedef ap_uint<VTA_MEMOP_SRAM_ADDR_BIT_WIDTH> memop_sram_T;
 
 /* \typedef memop_dram_T Memory operation DRAM index datatype*/
-typedef ap_uint<MEMOP_DRAM_ADDR_BIT_WIDTH> memop_dram_T;
+typedef ap_uint<VTA_MEMOP_DRAM_ADDR_BIT_WIDTH> memop_dram_T;
 
 /* \typedef memop_size_T Memory operation range datatype*/
-typedef ap_uint<MEMOP_SIZE_BIT_WIDTH> memop_size_T;
+typedef ap_uint<VTA_MEMOP_SIZE_BIT_WIDTH> memop_size_T;
 
 /* \typedef memop_stride_T Memory operation stride datatype*/
-typedef ap_uint<MEMOP_STRIDE_BIT_WIDTH> memop_stride_T;
+typedef ap_uint<VTA_MEMOP_STRIDE_BIT_WIDTH> memop_stride_T;
 
 /* \typedef memop_pad_T Memory operation pad width datatype*/
-typedef ap_uint<MEMOP_PAD_BIT_WIDTH> memop_pad_T;
+typedef ap_uint<VTA_MEMOP_PAD_BIT_WIDTH> memop_pad_T;
 
 /* \typedef aluop_opcode_T ALU operation opcode datatype*/
-typedef ap_uint<ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
+typedef ap_uint<VTA_ALU_OPCODE_BIT_WIDTH> aluop_opcode_T;
 
-/* \typedef aluop_opcode_T ALU operation immediate datatype*/
+/* \typedef aluop_imm_T ALU operation immediate datatype*/
-typedef ap_int<ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
+typedef ap_int<VTA_ALUOP_IMM_BIT_WIDTH> aluop_imm_T;
 
-/* \typedef aluop_opcode_T ALU operation shift immediate datatype*/
+/* \typedef aluop_sh_imm_T ALU operation shift immediate datatype*/
-typedef ap_uint<LOG_ACC_WIDTH> aluop_sh_imm_T;
+typedef ap_uint<VTA_LOG_ACC_WIDTH> aluop_sh_imm_T;
 
 /*!
 * \brief Fetch module.
@@ -104,12 +104,12 @@ typedef ap_uint<LOG_ACC_WIDTH> aluop_sh_imm_T;
 * \param gemm_queue GEMM instruction queue. AXI-stream FIFO.
 * \param store_queue Store instruction queue. AXI-stream FIFO.
 */
-void fetch (
+void fetch(
   uint32_t insn_count,
   volatile insn_T *insns,
-  hls::stream<insn_T> &load_queue,
-  hls::stream<insn_T> &gemm_queue,
-  hls::stream<insn_T> &store_queue);
+  hls::stream<insn_T> *load_queue,
+  hls::stream<insn_T> *gemm_queue,
+  hls::stream<insn_T> *store_queue);
 
 /*!
 * \brief Load module.
@@ -126,15 +126,14 @@ void fetch (
 * \param inp_mem Local input SRAM buffer. Write only single port BRAM.
 * \param wgt_mem Local weight SRAM buffer. Write only single port BRAM.
 */
-void load (
+void load(
   volatile inp_vec_T *inputs,
   volatile wgt_vec_T *weights,
-  hls::stream<insn_T> &load_queue,
-  hls::stream<bool> &g2l_dep_queue,
-  hls::stream<bool> &l2g_dep_queue,
-  inp_vec_T inp_mem[INP_BUFF_DEPTH][BATCH],
-  wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT]
-  );
+  hls::stream<insn_T> *load_queue,
+  hls::stream<bool> *g2l_dep_queue,
+  hls::stream<bool> *l2g_dep_queue,
+  inp_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH],
+  wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT]);
 
 /*!
 * \brief Compute module.
@@ -159,19 +158,18 @@ void load (
 * \param wgt_mem Local weight SRAM buffer. Read only single port BRAM.
 * \param out_mem Local output SRAM buffer. Write only single port BRAM.
 */
-void compute (
-  volatile uint32_t &done,
+void compute(
+  volatile uint32_t *done,
   volatile uop_T *uops,
   volatile acc_vec_T *biases,
-  hls::stream<insn_T> &gemm_queue,
-  hls::stream<bool> &l2g_dep_queue,
-  hls::stream<bool> &s2g_dep_queue,
-  hls::stream<bool> &g2l_dep_queue,
-  hls::stream<bool> &g2s_dep_queue,
-  out_vec_T inp_mem[INP_BUFF_DEPTH][BATCH],
-  wgt_vec_T wgt_mem[WGT_BUFF_DEPTH][BLOCK_OUT],
-  out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH]
-  );
+  hls::stream<insn_T> *gemm_queue,
+  hls::stream<bool> *l2g_dep_queue,
+  hls::stream<bool> *s2g_dep_queue,
+  hls::stream<bool> *g2l_dep_queue,
+  hls::stream<bool> *g2s_dep_queue,
+  out_vec_T inp_mem[VTA_INP_BUFF_DEPTH][VTA_BATCH],
+  wgt_vec_T wgt_mem[VTA_WGT_BUFF_DEPTH][VTA_BLOCK_OUT],
+  out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]);
 
 /*!
 * \brief Store module.
@@ -186,13 +184,12 @@ void compute (
 *   AXI-stream FIFO.
 * \param out_mem Local output SRAM buffer. Read only single port BRAM.
 */
-void store (
+void store(
   volatile out_vec_T *outputs,
-  hls::stream<insn_T> &store_queue,
-  hls::stream<bool> &g2s_dep_queue,
-  hls::stream<bool> &s2g_dep_queue,
-  out_vec_T out_mem[ACC_BUFF_DEPTH][BATCH]
-  );
+  hls::stream<insn_T> *store_queue,
+  hls::stream<bool> *g2s_dep_queue,
+  hls::stream<bool> *s2g_dep_queue,
+  out_vec_T out_mem[VTA_ACC_BUFF_DEPTH][VTA_BATCH]);
 
 /*!
 * \brief VTA wrapper for simulation purpose only.
@@ -205,7 +202,7 @@ void store (
 * \param biases Bias data base address in DRAM. AXI-4 master port.
 * \param outputs Output data base address in DRAM. AXI-4 master port.
 */
-void vta (
+void vta(
   uint32_t insn_count,
   volatile insn_T *insns,
   volatile uop_T *uops,
@@ -214,4 +211,4 @@ void vta (
   volatile acc_vec_T *biases,
   volatile out_vec_T *outputs);
 
-#endif  // VTA_MAIN_H_
\ No newline at end of file
+#endif  // VTA_VTA_H_
diff --git a/vta/include/vta/driver.h b/vta/include/vta/driver.h
index 2b5e0ea936745cc2e140b4310704aee495db279f..c93021d96e4b3dd65797b8c567fc38d3f5da5a8a 100644
--- a/vta/include/vta/driver.h
+++ b/vta/include/vta/driver.h
@@ -14,10 +14,10 @@ extern "C" {
 #include <stdlib.h>
 #include <stdint.h>
 
-/*! \brief Memory management constants with libxlnk_cma */
-#define CACHED 1
-/*! \brief Memory management constants with libxlnk_cma */
-#define NOT_CACHED 0
+/*! \brief Memory management constants */
+#define VTA_CACHED 1
+/*! \brief Memory management constants */
+#define VTA_NOT_CACHED 0
 
 /*! \brief VTA command handle */
 typedef void * VTAHandle;
@@ -97,4 +97,4 @@ void VTAProgram(const char* bitstream);
 #ifdef __cplusplus
 }
 #endif
-#endif // VTA_DRIVER_H_
+#endif  // VTA_DRIVER_H_
diff --git a/vta/include/vta/hw_spec.h b/vta/include/vta/hw_spec.h
index b18e94e63a07e00d080f0961734f109a335c2a19..0c30b344795a1b20a23f0f0abfa7acb7cdc4ce1f 100644
--- a/vta/include/vta/hw_spec.h
+++ b/vta/include/vta/hw_spec.h
@@ -14,150 +14,153 @@ extern "C" {
 #include <stdint.h>
 
 /*! log2 of instruction data type width */
-#define LOG_INS_WIDTH 7
+#define VTA_LOG_INS_WIDTH 7
 /*! Instruction data type width */
-#define INS_WIDTH (1<<LOG_INS_WIDTH)
+#define VTA_INS_WIDTH (1 << VTA_LOG_INS_WIDTH)
 /*! log2 of micro op data type width */
-#define LOG_UOP_WIDTH 5
+#define VTA_LOG_UOP_WIDTH 5
 /*! Micro Op data type width */
-#define UOP_WIDTH (1<<LOG_UOP_WIDTH)
+#define VTA_UOP_WIDTH (1 << VTA_LOG_UOP_WIDTH)
 /*! Weight data type width */
-#define WGT_WIDTH (1<<LOG_WGT_WIDTH)
+#define VTA_WGT_WIDTH (1 << VTA_LOG_WGT_WIDTH)
 /*! Input data type width */
-#define INP_WIDTH (1<<LOG_INP_WIDTH)
+#define VTA_INP_WIDTH (1 << VTA_LOG_INP_WIDTH)
 /*! Output data type width */
-#define OUT_WIDTH (1<<LOG_OUT_WIDTH)
+#define VTA_OUT_WIDTH (1 << VTA_LOG_OUT_WIDTH)
 /*! Accumulator data type width */
-#define ACC_WIDTH (1<<LOG_ACC_WIDTH)
+#define VTA_ACC_WIDTH (1 << VTA_LOG_ACC_WIDTH)
 /*! log2 of ALU data type width */
-#define LOG_ALU_WIDTH (LOG_ACC_WIDTH-1)
+#define VTA_LOG_ALU_WIDTH (VTA_LOG_ACC_WIDTH - 1)
 /*! ALU data type width */
-#define ALU_WIDTH (1<<LOG_ALU_WIDTH)
+#define VTA_ALU_WIDTH (1 << VTA_LOG_ALU_WIDTH)
 
 /*! Batch size (corresponds to A in (A,B)x(B,C) mat mult)*/
-#define BATCH (1<<LOG_BATCH)
+#define VTA_BATCH (1 << VTA_LOG_BATCH)
 /*! Blocking factor of inner most loop (corresponds to B in (A,B)x(B,C) mat mult) */
-#define BLOCK_IN (1<<LOG_BLOCK_IN)
+#define VTA_BLOCK_IN (1 << VTA_LOG_BLOCK_IN)
 /*! Blocking factor of the outer loop (corresponds to C in (A,B)x(B,C) mat mult) */
-#define BLOCK_OUT (1<<LOG_BLOCK_OUT)
+#define VTA_BLOCK_OUT (1 << VTA_LOG_BLOCK_OUT)
 
 /*! Weight vector width */
-#define WGT_VECTOR_WIDTH (WGT_WIDTH*BLOCK_IN)
+#define VTA_WGT_VECTOR_WIDTH (VTA_WGT_WIDTH * VTA_BLOCK_IN)
 /*! Input vector width */
-#define INP_VECTOR_WIDTH (INP_WIDTH*BLOCK_IN)
+#define VTA_INP_VECTOR_WIDTH (VTA_INP_WIDTH * VTA_BLOCK_IN)
 /*! Accumulator vector width */
-#define ACC_VECTOR_WIDTH (ACC_WIDTH*BLOCK_OUT)
+#define VTA_ACC_VECTOR_WIDTH (VTA_ACC_WIDTH * VTA_BLOCK_OUT)
 /*! Output vector width */
-#define OUT_VECTOR_WIDTH (OUT_WIDTH*BLOCK_OUT)
+#define VTA_OUT_VECTOR_WIDTH (VTA_OUT_WIDTH * VTA_BLOCK_OUT)
 
 /*! On-chip micro-op buffer size in B */
-#define UOP_BUFF_SIZE (1<<LOG_UOP_BUFF_SIZE)
+#define VTA_UOP_BUFF_SIZE (1 << VTA_LOG_UOP_BUFF_SIZE)
 /*! On-chip weight buffer size in B */
-#define WGT_BUFF_SIZE (1<<LOG_WGT_BUFF_SIZE)
+#define VTA_WGT_BUFF_SIZE (1 << VTA_LOG_WGT_BUFF_SIZE)
 /*! On-chip activation buffer size in B */
-#define INP_BUFF_SIZE (1<<LOG_INP_BUFF_SIZE)
+#define VTA_INP_BUFF_SIZE (1 << VTA_LOG_INP_BUFF_SIZE)
 /*! On-chip accumulator buffer size in B */
-#define ACC_BUFF_SIZE (1<<LOG_ACC_BUFF_SIZE)
+#define VTA_ACC_BUFF_SIZE (1 << VTA_LOG_ACC_BUFF_SIZE)
 
 /*! Size of instruction buffer element in B */
-#define INS_ELEM_BYTES (INS_WIDTH/8)
+#define VTA_INS_ELEM_BYTES (VTA_INS_WIDTH / 8)
 /*! Size of uop buffer element in B*/
-#define UOP_ELEM_BYTES (UOP_WIDTH/8)
+#define VTA_UOP_ELEM_BYTES (VTA_UOP_WIDTH / 8)
 /*! Size of activation buffer element in B*/
-#define INP_ELEM_BYTES (BATCH*BLOCK_IN*INP_WIDTH/8)
+#define VTA_INP_ELEM_BYTES (VTA_BATCH * VTA_BLOCK_IN * VTA_INP_WIDTH / 8)
 /*! Size of weight buffer element in B*/
-#define WGT_ELEM_BYTES (BLOCK_OUT*BLOCK_IN*WGT_WIDTH/8)
+#define VTA_WGT_ELEM_BYTES (VTA_BLOCK_OUT * VTA_BLOCK_IN * VTA_WGT_WIDTH / 8)
 /*! Size of accumulator buffer element in B*/
-#define ACC_ELEM_BYTES (BATCH*BLOCK_OUT*ACC_WIDTH/8)
+#define VTA_ACC_ELEM_BYTES (VTA_BATCH * VTA_BLOCK_OUT * VTA_ACC_WIDTH / 8)
 
 /*! On-chip micro-op buffer depth */
-#define UOP_BUFF_DEPTH (UOP_BUFF_SIZE/UOP_ELEM_BYTES)
+#define VTA_UOP_BUFF_DEPTH (VTA_UOP_BUFF_SIZE / VTA_UOP_ELEM_BYTES)
 /*! log2 of on-chip micro-op buffer depth */
-#define LOG_UOP_BUFF_DEPTH (LOG_UOP_BUFF_SIZE-LOG_UOP_WIDTH+3)
+#define VTA_LOG_UOP_BUFF_DEPTH (VTA_LOG_UOP_BUFF_SIZE - VTA_LOG_UOP_WIDTH + 3)
-// ! \brief On-chip weight buffer depth
+/*! On-chip weight buffer depth */
-#define WGT_BUFF_DEPTH (WGT_BUFF_SIZE/WGT_ELEM_BYTES)
+#define VTA_WGT_BUFF_DEPTH (VTA_WGT_BUFF_SIZE / VTA_WGT_ELEM_BYTES)
-/*! log2 of weight micro-op buffer depth */
+/*! log2 of on-chip weight buffer depth */
-#define LOG_WGT_BUFF_DEPTH (LOG_WGT_BUFF_SIZE-LOG_BLOCK_OUT-LOG_BLOCK_IN-LOG_WGT_WIDTH+3)
+#define VTA_LOG_WGT_BUFF_DEPTH \
+    (VTA_LOG_WGT_BUFF_SIZE - VTA_LOG_BLOCK_OUT - VTA_LOG_BLOCK_IN - VTA_LOG_WGT_WIDTH + 3)
 /*! On-chip activation buffer depth */
-#define INP_BUFF_DEPTH (INP_BUFF_SIZE/INP_ELEM_BYTES)
+#define VTA_INP_BUFF_DEPTH (VTA_INP_BUFF_SIZE / VTA_INP_ELEM_BYTES)
-/*! log2 of activation micro-op buffer depth */
+/*! log2 of on-chip activation buffer depth */
-#define LOG_INP_BUFF_DEPTH (LOG_INP_BUFF_SIZE-LOG_BATCH-LOG_BLOCK_IN-LOG_INP_WIDTH+3)
+#define VTA_LOG_INP_BUFF_DEPTH \
+    (VTA_LOG_INP_BUFF_SIZE - VTA_LOG_BATCH - VTA_LOG_BLOCK_IN - VTA_LOG_INP_WIDTH + 3)
 /*! On-chip accumulator buffer depth */
-#define ACC_BUFF_DEPTH (ACC_BUFF_SIZE/ACC_ELEM_BYTES)
+#define VTA_ACC_BUFF_DEPTH (VTA_ACC_BUFF_SIZE / VTA_ACC_ELEM_BYTES)
 /*! log2 of on-chip accumulator buffer depth */
-#define LOG_ACC_BUFF_DEPTH (LOG_ACC_BUFF_SIZE-LOG_BATCH-LOG_BLOCK_OUT-LOG_ACC_WIDTH+3)
+#define VTA_LOG_ACC_BUFF_DEPTH \
+    (VTA_LOG_ACC_BUFF_SIZE - VTA_LOG_BATCH - VTA_LOG_BLOCK_OUT - VTA_LOG_ACC_WIDTH + 3)
 
 /*! Instruction opcode field bitwidth */
-#define OPCODE_BIT_WIDTH 3
+#define VTA_OPCODE_BIT_WIDTH 3
 /*! ALU opcode field bitwidth */
-#define ALU_OPCODE_BIT_WIDTH 3
+#define VTA_ALU_OPCODE_BIT_WIDTH 3
 /*! ALU instruction reset mode bitwidth */
-#define ALU_RESET_BIT_WIDTH 2
+#define VTA_ALU_RESET_BIT_WIDTH 2
 
 /*! Opcode: load encoding */
-#define OPCODE_LOAD 0
+#define VTA_OPCODE_LOAD 0
 /*! Opcode: store encoding */
-#define OPCODE_STORE 1
+#define VTA_OPCODE_STORE 1
 /*! Opcode: GEMM encoding */
-#define OPCODE_GEMM 2
+#define VTA_OPCODE_GEMM 2
 /*! Opcode: finish encoding */
-#define OPCODE_FINISH 3
+#define VTA_OPCODE_FINISH 3
 /*! Opcode: ALU encoding */
-#define OPCODE_ALU 4
+#define VTA_OPCODE_ALU 4
 
 /*! ALU opcode: unary min op */
-#define ALU_OPCODE_MIN 0
+#define VTA_ALU_OPCODE_MIN 0
 /*! ALU opcode: unary max op */
-#define ALU_OPCODE_MAX 1
+#define VTA_ALU_OPCODE_MAX 1
 /*! ALU opcode: binary add op */
-#define ALU_OPCODE_ADD 2
+#define VTA_ALU_OPCODE_ADD 2
 /*! ALU opcode: binary sub op [NOT IMPLEMENTED] */
-#define ALU_OPCODE_SUB 3
+#define VTA_ALU_OPCODE_SUB 3
 /*! ALU opcode: binary mul op  [NOT IMPLEMENTED] */
-#define ALU_OPCODE_MUL 4
+#define VTA_ALU_OPCODE_MUL 4
 /*! ALU opcode: shift left by immediate op */
-#define ALU_OPCODE_SHL 5
+#define VTA_ALU_OPCODE_SHL 5
-/*! ALU opcode: shift right by immediate op [NOT IMPLEMENTED] */
+/*! ALU opcode: shift right by immediate op */
-#define ALU_OPCODE_SHR 6
+#define VTA_ALU_OPCODE_SHR 6
 
 /*! ALU instruction reset mode: set to min */
-#define ALU_RESET_MIN 3
+#define VTA_ALU_RESET_MIN 3
 /*! ALU instruction reset mode: set to zero */
-#define ALU_RESET_ZERO 0
+#define VTA_ALU_RESET_ZERO 0
 /*! ALU instruction reset mode: no reset */
-#define ALU_NO_RESET 2
+#define VTA_ALU_NO_RESET 2
 /*! ALU instruction reset mode: set to max */
-#define ALU_RESET_MAX 1
+#define VTA_ALU_RESET_MAX 1
 
 /*! Memory type field bitwidth */
-#define MEMOP_ID_BIT_WIDTH 2
+#define VTA_MEMOP_ID_BIT_WIDTH 2
-/*! Load/Store Instruction: DRAM address width*/
+/*! Load/Store Instruction: SRAM address width*/
-#define MEMOP_SRAM_ADDR_BIT_WIDTH 16
+#define VTA_MEMOP_SRAM_ADDR_BIT_WIDTH 16
 /*! Load/Store Instruction: DRAM address width*/
-#define MEMOP_DRAM_ADDR_BIT_WIDTH 32
+#define VTA_MEMOP_DRAM_ADDR_BIT_WIDTH 32
 /*! Load/Store Instruction: transfer size width*/
-#define MEMOP_SIZE_BIT_WIDTH 16
+#define VTA_MEMOP_SIZE_BIT_WIDTH 16
 /*! Load/Store Instruction: stride size width*/
-#define MEMOP_STRIDE_BIT_WIDTH 16
+#define VTA_MEMOP_STRIDE_BIT_WIDTH 16
 /*! Load/Store Instruction: padding width*/
-#define MEMOP_PAD_BIT_WIDTH 4
+#define VTA_MEMOP_PAD_BIT_WIDTH 4
 /*! Load/Store Instruction: padding value encoding width*/
-#define MEMOP_PAD_VAL_BIT_WIDTH 2
+#define VTA_MEMOP_PAD_VAL_BIT_WIDTH 2
 /*! ALU Instruction: immediate bitwidth*/
-#define ALUOP_IMM_BIT_WIDTH 16
+#define VTA_ALUOP_IMM_BIT_WIDTH 16
 /*! GEMM/ALU Instruction: loop max iter bits */
-#define LOOP_ITER_WIDTH 15
+#define VTA_LOOP_ITER_WIDTH 15
 
 /*! Mem ID constant: uop memory */
-#define MEM_ID_UOP 0
+#define VTA_MEM_ID_UOP 0
 /*! Mem ID constant: weight memory */
-#define MEM_ID_WGT 1
+#define VTA_MEM_ID_WGT 1
 /*! Mem ID constant: input memory */
-#define MEM_ID_INP 2
+#define VTA_MEM_ID_INP 2
 /*! Mem ID constant: accumulator/bias memory */
-#define MEM_ID_ACC 3
+#define VTA_MEM_ID_ACC 3
 /*! Mem ID constant: output store buffer */
-#define MEM_ID_OUT 4
+#define VTA_MEM_ID_OUT 4
 
 // Instruction organization layout:
 //
@@ -218,152 +221,152 @@ extern "C" {
 // arg f: imm                   | alu_imm_T         |
 
 /*! Load/Store instruction start position of the opcode field */
-#define INSN_MEM_0_0 0
+#define VTA_INSN_MEM_0_0 0
 /*! Load/Store instruction end position of the opcode field */
-#define INSN_MEM_0_1 (INSN_MEM_0_0+OPCODE_BIT_WIDTH-1)
+#define VTA_INSN_MEM_0_1 (VTA_INSN_MEM_0_0 + VTA_OPCODE_BIT_WIDTH - 1)
 /*! Load/Store instruction position of the pop_prev_dep field */
-#define INSN_MEM_1   (INSN_MEM_0_1+1)
+#define VTA_INSN_MEM_1   (VTA_INSN_MEM_0_1 + 1)
 /*! Load/Store instruction position of the pop_next_dep field */
-#define INSN_MEM_2   (INSN_MEM_1+1)
+#define VTA_INSN_MEM_2   (VTA_INSN_MEM_1 + 1)
 /*! Load/Store instruction position of the push_prev_dependence field */
-#define INSN_MEM_3   (INSN_MEM_2+1)
+#define VTA_INSN_MEM_3   (VTA_INSN_MEM_2 + 1)
 /*! Load/Store instruction position of the push_next_dependence field */
-#define INSN_MEM_4   (INSN_MEM_3+1)
+#define VTA_INSN_MEM_4   (VTA_INSN_MEM_3 + 1)
 /*! Load/Store instruction start position of the memory_type field */
-#define INSN_MEM_5_0 (INSN_MEM_4+1)
+#define VTA_INSN_MEM_5_0 (VTA_INSN_MEM_4 + 1)
 /*! Load/Store instruction end position of the memory_type field */
-#define INSN_MEM_5_1 (INSN_MEM_5_0+MEMOP_ID_BIT_WIDTH-1)
+#define VTA_INSN_MEM_5_1 (VTA_INSN_MEM_5_0 + VTA_MEMOP_ID_BIT_WIDTH - 1)
 /*! Load/Store instruction start position of the sram_base field */
-#define INSN_MEM_6_0 (INSN_MEM_5_1+1)
+#define VTA_INSN_MEM_6_0 (VTA_INSN_MEM_5_1 + 1)
 /*! Load/Store instruction end position of the sram_base field */
-#define INSN_MEM_6_1 (INSN_MEM_6_0+MEMOP_SRAM_ADDR_BIT_WIDTH-1)
+#define VTA_INSN_MEM_6_1 (VTA_INSN_MEM_6_0 + VTA_MEMOP_SRAM_ADDR_BIT_WIDTH - 1)
 /*! Load/Store instruction start position of the dram_base field */
-#define INSN_MEM_7_0 (INSN_MEM_6_1+1)
+#define VTA_INSN_MEM_7_0 (VTA_INSN_MEM_6_1 + 1)
 /*! Load/Store instruction end position of the dram_base field */
-#define INSN_MEM_7_1 (INSN_MEM_7_0+MEMOP_DRAM_ADDR_BIT_WIDTH-1)
+#define VTA_INSN_MEM_7_1 (VTA_INSN_MEM_7_0 + VTA_MEMOP_DRAM_ADDR_BIT_WIDTH - 1)
 /*! Load/Store instruction start position of the y_size field */
-#define INSN_MEM_8_0 64
+#define VTA_INSN_MEM_8_0 64
 /*! Load/Store instruction end position of the y_size field */
-#define INSN_MEM_8_1 (INSN_MEM_8_0+MEMOP_SIZE_BIT_WIDTH-1)
+#define VTA_INSN_MEM_8_1 (VTA_INSN_MEM_8_0 + VTA_MEMOP_SIZE_BIT_WIDTH - 1)
 /*! Load/Store instruction start position of the x_size field */
-#define INSN_MEM_9_0 (INSN_MEM_8_1+1)
+#define VTA_INSN_MEM_9_0 (VTA_INSN_MEM_8_1 + 1)
-/*! Load/Store instruction start position of the x_size field */
+/*! Load/Store instruction end position of the x_size field */
-#define INSN_MEM_9_1 (INSN_MEM_9_0+MEMOP_SIZE_BIT_WIDTH-1)
+#define VTA_INSN_MEM_9_1 (VTA_INSN_MEM_9_0 + VTA_MEMOP_SIZE_BIT_WIDTH - 1)
 /*! Load/Store instruction start position of the x_stride field */
-#define INSN_MEM_A_0 (INSN_MEM_9_1+1)
+#define VTA_INSN_MEM_A_0 (VTA_INSN_MEM_9_1 + 1)
 /*! Load/Store instruction end position of the x_stride field */
-#define INSN_MEM_A_1 (INSN_MEM_A_0+MEMOP_STRIDE_BIT_WIDTH-1)
+#define VTA_INSN_MEM_A_1 (VTA_INSN_MEM_A_0 + VTA_MEMOP_STRIDE_BIT_WIDTH - 1)
 /*! Load/Store instruction start position of the y_pad_0 field */
-#define INSN_MEM_B_0 (INSN_MEM_A_1+1)
+#define VTA_INSN_MEM_B_0 (VTA_INSN_MEM_A_1 + 1)
-/*! Load/Store instruction start position of the y_pad_0 field */
+/*! Load/Store instruction end position of the y_pad_0 field */
-#define INSN_MEM_B_1 (INSN_MEM_B_0+MEMOP_PAD_BIT_WIDTH-1)
+#define VTA_INSN_MEM_B_1 (VTA_INSN_MEM_B_0 + VTA_MEMOP_PAD_BIT_WIDTH - 1)
 /*! Load/Store instruction start position of the y_pad_1 field */
-#define INSN_MEM_C_0 (INSN_MEM_B_1+1)
+#define VTA_INSN_MEM_C_0 (VTA_INSN_MEM_B_1 + 1)
-/*! Load/Store instruction start position of the y_pad_1 field */
+/*! Load/Store instruction end position of the y_pad_1 field */
-#define INSN_MEM_C_1 (INSN_MEM_C_0+MEMOP_PAD_BIT_WIDTH-1)
+#define VTA_INSN_MEM_C_1 (VTA_INSN_MEM_C_0 + VTA_MEMOP_PAD_BIT_WIDTH - 1)
 /*! Load/Store instruction start position of the x_pad_0 field */
-#define INSN_MEM_D_0 (INSN_MEM_C_1+1)
+#define VTA_INSN_MEM_D_0 (VTA_INSN_MEM_C_1 + 1)
-/*! Load/Store instruction start position of the x_pad_0 field */
+/*! Load/Store instruction end position of the x_pad_0 field */
-#define INSN_MEM_D_1 (INSN_MEM_D_0+MEMOP_PAD_BIT_WIDTH-1)
+#define VTA_INSN_MEM_D_1 (VTA_INSN_MEM_D_0 + VTA_MEMOP_PAD_BIT_WIDTH - 1)
 /*! Load/Store instruction start position of the x_pad_1 field */
-#define INSN_MEM_E_0 (INSN_MEM_D_1+1)
+#define VTA_INSN_MEM_E_0 (VTA_INSN_MEM_D_1 + 1)
-/*! Load/Store instruction start position of the x_pad_1 field */
+/*! Load/Store instruction end position of the x_pad_1 field */
-#define INSN_MEM_E_1 (INSN_MEM_E_0+MEMOP_PAD_BIT_WIDTH-1)
+#define VTA_INSN_MEM_E_1 (VTA_INSN_MEM_E_0 + VTA_MEMOP_PAD_BIT_WIDTH - 1)
 
 /*! GEMM instruction start position of the opcode field */
-#define INSN_GEM_0_0 0
+#define VTA_INSN_GEM_0_0 0
 /*! GEMM instruction end position of the opcode field */
-#define INSN_GEM_0_1 (INSN_GEM_0_0+OPCODE_BIT_WIDTH-1)
+#define VTA_INSN_GEM_0_1 (VTA_INSN_GEM_0_0 + VTA_OPCODE_BIT_WIDTH - 1)
 /*! GEMM instruction position of the pop_prev_dep field */
-#define INSN_GEM_1   (INSN_GEM_0_1+1)
+#define VTA_INSN_GEM_1   (VTA_INSN_GEM_0_1 + 1)
 /*! GEMM instruction position of the pop_next_dep field */
-#define INSN_GEM_2   (INSN_GEM_1+1)
+#define VTA_INSN_GEM_2   (VTA_INSN_GEM_1 + 1)
 /*! GEMM instruction position of the push_prev_dependence field */
-#define INSN_GEM_3   (INSN_GEM_2+1)
+#define VTA_INSN_GEM_3   (VTA_INSN_GEM_2 + 1)
 /*! GEMM instruction position of the push_next_dependence field */
-#define INSN_GEM_4   (INSN_GEM_3+1)
+#define VTA_INSN_GEM_4   (VTA_INSN_GEM_3 + 1)
 /*! GEMM instruction start position of the uop_bgn field */
-#define INSN_GEM_5_0 (INSN_GEM_4+1)
+#define VTA_INSN_GEM_5_0 (VTA_INSN_GEM_4 + 1)
 /*! GEMM instruction end position of the uop_bgn field */
-#define INSN_GEM_5_1 (INSN_GEM_5_0+LOG_UOP_BUFF_DEPTH-1)
+#define VTA_INSN_GEM_5_1 (VTA_INSN_GEM_5_0 + VTA_LOG_UOP_BUFF_DEPTH - 1)
 /*! GEMM instruction start position of the uop_end field */
-#define INSN_GEM_6_0 (INSN_GEM_5_1+1)
+#define VTA_INSN_GEM_6_0 (VTA_INSN_GEM_5_1 + 1)
 /*! GEMM instruction end position of the uop_end field */
-#define INSN_GEM_6_1 (INSN_GEM_6_0+LOG_UOP_BUFF_DEPTH+1-1)
+#define VTA_INSN_GEM_6_1 (VTA_INSN_GEM_6_0 + VTA_LOG_UOP_BUFF_DEPTH + 1 - 1)
 /*! GEMM instruction start position of the iter_out field */
-#define INSN_GEM_7_0 (INSN_GEM_6_1+1)
+#define VTA_INSN_GEM_7_0 (VTA_INSN_GEM_6_1 + 1)
 /*! GEMM instruction end position of the iter_out field */
-#define INSN_GEM_7_1 (INSN_GEM_7_0+LOOP_ITER_WIDTH-1)
+#define VTA_INSN_GEM_7_1 (VTA_INSN_GEM_7_0 + VTA_LOOP_ITER_WIDTH - 1)
 /*! GEMM instruction start position of the iter_in field */
-#define INSN_GEM_8_0 (INSN_GEM_7_1+1)
+#define VTA_INSN_GEM_8_0 (VTA_INSN_GEM_7_1 + 1)
 /*! GEMM instruction end position of the iter_in field */
-#define INSN_GEM_8_1 (INSN_GEM_8_0+LOOP_ITER_WIDTH-1)
+#define VTA_INSN_GEM_8_1 (VTA_INSN_GEM_8_0 + VTA_LOOP_ITER_WIDTH - 1)
 /*! GEMM instruction start position of the dst_factor_out field */
-#define INSN_GEM_9_0 64
+#define VTA_INSN_GEM_9_0 64
 /*! GEMM instruction end position of the dst_factor_out field */
-#define INSN_GEM_9_1 (INSN_GEM_9_0+LOG_ACC_BUFF_DEPTH-1)
+#define VTA_INSN_GEM_9_1 (VTA_INSN_GEM_9_0 + VTA_LOG_ACC_BUFF_DEPTH - 1)
 /*! GEMM instruction start position of the dst_factor_in field */
-#define INSN_GEM_A_0 (INSN_GEM_9_1+1)
+#define VTA_INSN_GEM_A_0 (VTA_INSN_GEM_9_1 + 1)
 /*! GEMM instruction end position of the dst_factor_in field */
-#define INSN_GEM_A_1 (INSN_GEM_A_0+LOG_ACC_BUFF_DEPTH-1)
+#define VTA_INSN_GEM_A_1 (VTA_INSN_GEM_A_0 + VTA_LOG_ACC_BUFF_DEPTH - 1)
 /*! GEMM instruction start position of the src_factor_out field */
-#define INSN_GEM_B_0 (INSN_GEM_A_1+1)
+#define VTA_INSN_GEM_B_0 (VTA_INSN_GEM_A_1 + 1)
 /*! GEMM instruction end position of the src_factor_out field */
-#define INSN_GEM_B_1 (INSN_GEM_B_0+LOG_ACC_BUFF_DEPTH-1)
+#define VTA_INSN_GEM_B_1 (VTA_INSN_GEM_B_0 + VTA_LOG_ACC_BUFF_DEPTH - 1)
 /*! GEMM instruction start position of the src_factor_in field */
-#define INSN_GEM_C_0 (INSN_GEM_B_1+1)
+#define VTA_INSN_GEM_C_0 (VTA_INSN_GEM_B_1 + 1)
 /*! GEMM instruction end position of the src_factor_in field */
-#define INSN_GEM_C_1 (INSN_GEM_C_0+LOG_ACC_BUFF_DEPTH-1)
+#define VTA_INSN_GEM_C_1 (VTA_INSN_GEM_C_0 + VTA_LOG_ACC_BUFF_DEPTH - 1)
 
 /*! GEMM instruction start position of the wgt_factor_out field */
-#define INSN_GEM_D_0 (INSN_GEM_C_1+1)
+#define VTA_INSN_GEM_D_0 (VTA_INSN_GEM_C_1 + 1)
 /*! GEMM instruction end position of the wgt_factor_out field */
-#define INSN_GEM_D_1 (INSN_GEM_D_0+LOG_WGT_BUFF_DEPTH-1)
+#define VTA_INSN_GEM_D_1 (VTA_INSN_GEM_D_0 + VTA_LOG_WGT_BUFF_DEPTH - 1)
 /*! GEMM instruction start position of the wgt_factor_in field */
-#define INSN_GEM_E_0 (INSN_GEM_D_1+1)
+#define VTA_INSN_GEM_E_0 (VTA_INSN_GEM_D_1 + 1)
 /*! GEMM instruction end position of the wgt_factor_in field */
-#define INSN_GEM_E_1 (INSN_GEM_E_0+LOG_WGT_BUFF_DEPTH-1)
+#define VTA_INSN_GEM_E_1 (VTA_INSN_GEM_E_0 + VTA_LOG_WGT_BUFF_DEPTH - 1)
 
 /*! ALU instruction start position of the alu_opcode field */
-#define INSN_ALU_D_0 (INSN_GEM_C_1+1)
+#define VTA_INSN_ALU_D_0 (VTA_INSN_GEM_C_1 + 1)
 /*! ALU instruction end position of the alu_opcode field */
-#define INSN_ALU_D_1 (INSN_ALU_D_0+ALU_OPCODE_BIT_WIDTH-1)
+#define VTA_INSN_ALU_D_1 (VTA_INSN_ALU_D_0 + VTA_ALU_OPCODE_BIT_WIDTH - 1)
 /*! ALU instruction position of the use_imm field */
-#define INSN_ALU_E   (INSN_ALU_D_1+1)
+#define VTA_INSN_ALU_E   (VTA_INSN_ALU_D_1 + 1)
 /*! ALU instruction start position of the immediate field */
-#define INSN_ALU_F_0 (INSN_ALU_E+1)
+#define VTA_INSN_ALU_F_0 (VTA_INSN_ALU_E + 1)
 /*! ALU instruction end position of the immediate field */
-#define INSN_ALU_F_1 (INSN_ALU_F_0+ALUOP_IMM_BIT_WIDTH-1)
+#define VTA_INSN_ALU_F_1 (VTA_INSN_ALU_F_0 + VTA_ALUOP_IMM_BIT_WIDTH - 1)
 
 /*! GEMM Micro-op position of the reset_out field */
-#define UOP_GEM_0 0
+#define VTA_UOP_GEM_0 0
 /*! GEMM Micro-op start position of the acc_idx field */
-#define UOP_GEM_1_0 (UOP_GEM_0+1)
+#define VTA_UOP_GEM_1_0 (VTA_UOP_GEM_0 + 1)
 /*! GEMM Micro-op end position of the acc_idx field */
-#define UOP_GEM_1_1 (UOP_GEM_1_0+LOG_ACC_BUFF_DEPTH-1)
+#define VTA_UOP_GEM_1_1 (VTA_UOP_GEM_1_0 + VTA_LOG_ACC_BUFF_DEPTH - 1)
 /*! GEMM Micro-op start position of the inp_idx field */
-#define UOP_GEM_2_0 (UOP_GEM_1_1+1)
+#define VTA_UOP_GEM_2_0 (VTA_UOP_GEM_1_1 + 1)
 /*! GEMM Micro-op end position of the inp_idx field */
-#define UOP_GEM_2_1 (UOP_GEM_2_0+LOG_ACC_BUFF_DEPTH-1)
+#define VTA_UOP_GEM_2_1 (VTA_UOP_GEM_2_0 + VTA_LOG_ACC_BUFF_DEPTH - 1)
 /*! GEMM Micro-op start position of the wgt_idx field */
-#define UOP_GEM_3_0 (UOP_GEM_2_1+1)
+#define VTA_UOP_GEM_3_0 (VTA_UOP_GEM_2_1 + 1)
 /*! GEMM Micro-op end position of the wgt_idx field */
-#define UOP_GEM_3_1 (UOP_GEM_3_0+LOG_WGT_BUFF_DEPTH-1)
+#define VTA_UOP_GEM_3_1 (VTA_UOP_GEM_3_0 + VTA_LOG_WGT_BUFF_DEPTH - 1)
 
-/*! GEMM Micro-op position of the reset_out field */
+/*! ALU Micro-op position of the reset_out field */
-#define UOP_ALU_0 0
+#define VTA_UOP_ALU_0 0
-/*! GEMM Micro-op start position of the acc_idx field */
+/*! ALU Micro-op start position of the acc_idx field */
-#define UOP_ALU_1_0 (UOP_ALU_0+1)
+#define VTA_UOP_ALU_1_0 (VTA_UOP_ALU_0 + 1)
-/*! GEMM Micro-op end position of the acc_idx field */
+/*! ALU Micro-op end position of the acc_idx field */
-#define UOP_ALU_1_1 (UOP_ALU_1_0+LOG_ACC_BUFF_DEPTH-1)
+#define VTA_UOP_ALU_1_1 (VTA_UOP_ALU_1_0 + VTA_LOG_ACC_BUFF_DEPTH - 1)
-/*! GEMM Micro-op start position of the inp_idx field */
+/*! ALU Micro-op start position of the inp_idx field */
-#define UOP_ALU_2_0 (UOP_ALU_1_1+1)
+#define VTA_UOP_ALU_2_0 (VTA_UOP_ALU_1_1 + 1)
-/*! GEMM Micro-op end position of the inp_idx field */
+/*! ALU Micro-op end position of the inp_idx field */
-#define UOP_ALU_2_1 (UOP_ALU_2_0+LOG_ACC_BUFF_DEPTH-1)
+#define VTA_UOP_ALU_2_1 (VTA_UOP_ALU_2_0 + VTA_LOG_ACC_BUFF_DEPTH - 1)
-/*! GEMM Micro-op start position of the wgt_idx field */
+/*! ALU Micro-op start position of the wgt_idx field */
-#define UOP_ALU_3_0 (UOP_ALU_2_1+1)
+#define VTA_UOP_ALU_3_0 (VTA_UOP_ALU_2_1 + 1)
-/*! GEMM Micro-op end position of the wgt_idx field */
+/*! ALU Micro-op end position of the wgt_idx field */
-#define UOP_ALU_3_1 (UOP_ALU_3_0+LOG_WGT_BUFF_DEPTH-1)
+#define VTA_UOP_ALU_3_1 (VTA_UOP_ALU_3_0 + VTA_LOG_WGT_BUFF_DEPTH - 1)
 
 /*! \brief VTA generic instruction */
 typedef struct {
@@ -382,7 +385,7 @@ typedef struct {
 */
 typedef struct {
   /*! \brief The instruction opcode */
-  uint64_t opcode         : OPCODE_BIT_WIDTH;
+  uint64_t opcode         : VTA_OPCODE_BIT_WIDTH;
   /*! \brief Unused in this instruction */
   uint64_t pop_prev_dep   : 1;
   /*! \brief Pop dependence token from GEMM stage */
@@ -392,25 +395,25 @@ typedef struct {
   /*! \brief Push dependence token to GEMM stage */
   uint64_t push_next_dep  : 1;
   /*! \brief Source/destination SRAM for store/load instruction */
-  uint64_t memory_type    : MEMOP_ID_BIT_WIDTH;
+  uint64_t memory_type    : VTA_MEMOP_ID_BIT_WIDTH;
   /*! \brief SRAM base address (pointer to memory elem type) */
-  uint64_t sram_base      : MEMOP_SRAM_ADDR_BIT_WIDTH;
+  uint64_t sram_base      : VTA_MEMOP_SRAM_ADDR_BIT_WIDTH;
   /*! \brief DRAM base address (pointer to memory elem type) */
-  uint64_t dram_base      : MEMOP_DRAM_ADDR_BIT_WIDTH;
+  uint64_t dram_base      : VTA_MEMOP_DRAM_ADDR_BIT_WIDTH;
   /*! \brief 2D access pattern: y-size */
-  uint64_t y_size         : MEMOP_SIZE_BIT_WIDTH;
+  uint64_t y_size         : VTA_MEMOP_SIZE_BIT_WIDTH;
   /*! \brief 2D access pattern: x-size (in terms of memory elements) */
-  uint64_t x_size         : MEMOP_SIZE_BIT_WIDTH;
+  uint64_t x_size         : VTA_MEMOP_SIZE_BIT_WIDTH;
   /*! \brief 2D access pattern: x-stride (in terms of memory elements) */
-  uint64_t x_stride       : MEMOP_STRIDE_BIT_WIDTH;
+  uint64_t x_stride       : VTA_MEMOP_STRIDE_BIT_WIDTH;
   /*! \brief 2D access pattern: start padding along y dimension */
-  uint64_t y_pad_0        : MEMOP_PAD_BIT_WIDTH;
+  uint64_t y_pad_0        : VTA_MEMOP_PAD_BIT_WIDTH;
   /*! \brief 2D access pattern: end padding along y dimension */
-  uint64_t y_pad_1        : MEMOP_PAD_BIT_WIDTH;
+  uint64_t y_pad_1        : VTA_MEMOP_PAD_BIT_WIDTH;
   /*! \brief 2D access pattern: start padding along x dimension */
-  uint64_t x_pad_0        : MEMOP_PAD_BIT_WIDTH;
+  uint64_t x_pad_0        : VTA_MEMOP_PAD_BIT_WIDTH;
   /*! \brief 2D access pattern: end padding along x dimension */
-  uint64_t x_pad_1        : MEMOP_PAD_BIT_WIDTH;
+  uint64_t x_pad_1        : VTA_MEMOP_PAD_BIT_WIDTH;
 } VTAMemInsn;
 
 /*! \brief VTA GEMM instruction
@@ -442,7 +445,7 @@ typedef struct {
 */
 typedef struct {
   /*! \brief The instruction opcode */
-  uint64_t opcode         : OPCODE_BIT_WIDTH;
+  uint64_t opcode         : VTA_OPCODE_BIT_WIDTH;
   /*! \brief Pop dependence token from load stage */
   uint64_t pop_prev_dep   : 1;
   /*! \brief Pop dependence token from store stage */
@@ -452,25 +455,25 @@ typedef struct {
   /*! \brief Push dependence token to store stage */
   uint64_t push_next_dep  : 1;
   /*! \brief Micro-op begin address */
-  uint64_t uop_bgn        : LOG_UOP_BUFF_DEPTH;
+  uint64_t uop_bgn        : VTA_LOG_UOP_BUFF_DEPTH;
   /*! \brief Micro-op end address */
-  uint64_t uop_end        : LOG_UOP_BUFF_DEPTH+1;
+  uint64_t uop_end        : VTA_LOG_UOP_BUFF_DEPTH+1;
   /*! \brief Iterations in the outer uop execution loop */
-  uint64_t iter_out       : LOOP_ITER_WIDTH;
+  uint64_t iter_out       : VTA_LOOP_ITER_WIDTH;
   /*! \brief Iterations in the inner uop execution loop */
-  uint64_t iter_in        : LOOP_ITER_WIDTH;
+  uint64_t iter_in        : VTA_LOOP_ITER_WIDTH;
   /*! \brief Outer loop accumulator memory index factor */
-  uint64_t dst_factor_out : LOG_ACC_BUFF_DEPTH;
+  uint64_t dst_factor_out : VTA_LOG_ACC_BUFF_DEPTH;
   /*! \brief Inner loop accumulator memory index factor */
-  uint64_t dst_factor_in  : LOG_ACC_BUFF_DEPTH;
+  uint64_t dst_factor_in  : VTA_LOG_ACC_BUFF_DEPTH;
   /*! \brief Outer loop input memory index factor */
-  uint64_t src_factor_out : LOG_ACC_BUFF_DEPTH;
+  uint64_t src_factor_out : VTA_LOG_ACC_BUFF_DEPTH;
   /*! \brief Inner loop input memory index factor */
-  uint64_t src_factor_in  : LOG_ACC_BUFF_DEPTH;
+  uint64_t src_factor_in  : VTA_LOG_ACC_BUFF_DEPTH;
   /*! \brief Outer loop weight memory index factor */
-  uint64_t wgt_factor_out : LOG_WGT_BUFF_DEPTH;
+  uint64_t wgt_factor_out : VTA_LOG_WGT_BUFF_DEPTH;
   /*! \brief Inner loop weight memory index factor */
-  uint64_t wgt_factor_in  : LOG_WGT_BUFF_DEPTH;
+  uint64_t wgt_factor_in  : VTA_LOG_WGT_BUFF_DEPTH;
 } VTAGemInsn;
 
 /*! \brief VTA ALU instruction
@@ -504,7 +507,7 @@ typedef struct {
 */
 typedef struct {
   /*! \brief The instruction opcode */
-  uint64_t opcode         : OPCODE_BIT_WIDTH;
+  uint64_t opcode         : VTA_OPCODE_BIT_WIDTH;
   /*! \brief Pop dependence token from load stage */
   uint64_t pop_prev_dep   : 1;
   /*! \brief Pop dependence token from store stage */
@@ -514,27 +517,27 @@ typedef struct {
   /*! \brief Push dependence token to store stage */
   uint64_t push_next_dep  : 1;
   /*! \brief Micro-op begin address */
-  uint64_t uop_bgn        : LOG_UOP_BUFF_DEPTH;
+  uint64_t uop_bgn        : VTA_LOG_UOP_BUFF_DEPTH;
   /*! \brief Micro-op end address */
-  uint64_t uop_end        : LOG_UOP_BUFF_DEPTH+1;
+  uint64_t uop_end        : VTA_LOG_UOP_BUFF_DEPTH+1;
   /*! \brief Iterations in the outer uop execution loop */
-  uint64_t iter_out       : LOOP_ITER_WIDTH;
+  uint64_t iter_out       : VTA_LOOP_ITER_WIDTH;
   /*! \brief Iterations in the inner uop execution loop */
-  uint64_t iter_in        : LOOP_ITER_WIDTH;
+  uint64_t iter_in        : VTA_LOOP_ITER_WIDTH;
   /*! \brief Outer loop accumulator memory destination index factor */
-  uint64_t dst_factor_out : LOG_ACC_BUFF_DEPTH;
+  uint64_t dst_factor_out : VTA_LOG_ACC_BUFF_DEPTH;
   /*! \brief Inner loop accumulator memory destination index factor */
-  uint64_t dst_factor_in  : LOG_ACC_BUFF_DEPTH;
+  uint64_t dst_factor_in  : VTA_LOG_ACC_BUFF_DEPTH;
   /*! \brief Outer loop accumulator memory source index factor */
-  uint64_t src_factor_out : LOG_ACC_BUFF_DEPTH;
+  uint64_t src_factor_out : VTA_LOG_ACC_BUFF_DEPTH;
   /*! \brief Inner loop accumulator memory source index factor */
-  uint64_t src_factor_in  : LOG_ACC_BUFF_DEPTH;
+  uint64_t src_factor_in  : VTA_LOG_ACC_BUFF_DEPTH;
   /*! \brief ALU opcode */
-  uint64_t alu_opcode     : ALU_OPCODE_BIT_WIDTH;
+  uint64_t alu_opcode     : VTA_ALU_OPCODE_BIT_WIDTH;
   /*! \brief Use immediate is true */
   uint64_t use_imm        : 1;
   /*! \brief Immediate value */
-  uint64_t imm            : ALUOP_IMM_BIT_WIDTH;
+  uint64_t imm            : VTA_ALUOP_IMM_BIT_WIDTH;
 } VTAAluInsn;
 
 /*! \brief VTA ALU instruction converter */
@@ -554,14 +557,14 @@ typedef struct {
   /*! \brief Initialize acc_mem at index dst_idx to 0*/
   uint32_t reset_out  : 1;
   /*! \brief Destination index (indexes accum buffer) */
-  uint32_t dst_idx    : LOG_ACC_BUFF_DEPTH;
+  uint32_t dst_idx    : VTA_LOG_ACC_BUFF_DEPTH;
   /*! \brief Source index (indexes input buffer for GEMM or accum buffer for ALU) */
-  uint32_t src_idx    : LOG_ACC_BUFF_DEPTH;
+  uint32_t src_idx    : VTA_LOG_ACC_BUFF_DEPTH;
   /*! \brief Weight index (indexes weight buffer) */
-  uint32_t wgt_idx    : LOG_WGT_BUFF_DEPTH;
+  uint32_t wgt_idx    : VTA_LOG_WGT_BUFF_DEPTH;
 } VTAUop;
 
 #ifdef __cplusplus
 }
 #endif
-#endif // VTA_HW_SPEC_H_
+#endif  // VTA_HW_SPEC_H_
diff --git a/vta/make/config.mk b/vta/make/config.mk
index 8142571a1cca342849a01e067f41ac0e34a94116..06143d7771672b163b8cabce3439db7e01329934 100644
--- a/vta/make/config.mk
+++ b/vta/make/config.mk
@@ -27,70 +27,72 @@ ADD_LDFLAGS=
 ADD_CFLAGS=
 
 # the hardware target
-TARGET=PYNQ_TARGET
+TARGET = VTA_PYNQ_TARGET
 
 #---------------------
 # VTA hardware parameters
 #--------------------
 
 #  Log of input/activation width in bits (default 3 -> 8 bits)
-LOG_INP_WIDTH = 3
+VTA_LOG_INP_WIDTH = 3
 #  Log of kernel weight width in bits (default 3 -> 8 bits)
-LOG_WGT_WIDTH = 3
+VTA_LOG_WGT_WIDTH = 3
 #  Log of accum width in bits (default 5 -> 32 bits)
-LOG_ACC_WIDTH = 5
+VTA_LOG_ACC_WIDTH = 5
 #  Log of tensor batch size (A in (A,B)x(B,C) matrix multiplication)
-LOG_BATCH = 0
+VTA_LOG_BATCH = 0
 #  Log of tensor inner block size (B in (A,B)x(B,C) matrix multiplication)
-LOG_BLOCK_IN = 4
+VTA_LOG_BLOCK_IN = 4
 #  Log of tensor outer block size (C in (A,B)x(B,C) matrix multiplication)
-LOG_BLOCK_OUT = 4
+VTA_LOG_BLOCK_OUT = 4
 #  Log of uop buffer size in Bytes
-LOG_UOP_BUFF_SIZE = 15
+VTA_LOG_UOP_BUFF_SIZE = 15
 #  Log of inp buffer size in Bytes
-LOG_INP_BUFF_SIZE = 15
+VTA_LOG_INP_BUFF_SIZE = 15
 #  Log of wgt buffer size in Bytes
-LOG_WGT_BUFF_SIZE = 15
+VTA_LOG_WGT_BUFF_SIZE = 15
 #  Log of acc buffer size in Bytes
-LOG_ACC_BUFF_SIZE = 17
+VTA_LOG_ACC_BUFF_SIZE = 17
 
 #---------------------
 # Derived VTA hardware parameters
 #--------------------
 
 #  Input width in bits
-INP_WIDTH = $(shell echo "$$(( 1 << $(LOG_INP_WIDTH) ))" )
+VTA_INP_WIDTH = $(shell echo "$$(( 1 << $(VTA_LOG_INP_WIDTH) ))" )
 #  Weight width in bits
-WGT_WIDTH = $(shell echo "$$(( 1 << $(LOG_WGT_WIDTH) ))" )
+VTA_WGT_WIDTH = $(shell echo "$$(( 1 << $(VTA_LOG_WGT_WIDTH) ))" )
 #  Log of output width in bits
-LOG_OUT_WIDTH = $(LOG_INP_WIDTH)
+VTA_LOG_OUT_WIDTH = $(VTA_LOG_INP_WIDTH)
 #  Output width in bits
-OUT_WIDTH = $(shell echo "$$(( 1 << $(LOG_OUT_WIDTH) ))" )
+VTA_OUT_WIDTH = $(shell echo "$$(( 1 << $(VTA_LOG_OUT_WIDTH) ))" )
 #  Tensor batch size
-BATCH = $(shell echo "$$(( 1 << $(LOG_BATCH) ))" )
+VTA_BATCH = $(shell echo "$$(( 1 << $(VTA_LOG_BATCH) ))" )
 #  Tensor outer block size
-IN_BLOCK = $(shell echo "$$(( 1 << $(LOG_BLOCK_IN) ))" )
+VTA_IN_BLOCK = $(shell echo "$$(( 1 << $(VTA_LOG_BLOCK_IN) ))" )
 #  Tensor inner block size
-OUT_BLOCK = $(shell echo "$$(( 1 << $(LOG_BLOCK_OUT) ))" )
+VTA_OUT_BLOCK = $(shell echo "$$(( 1 << $(VTA_LOG_BLOCK_OUT) ))" )
 #  Uop buffer size in Bytes
-UOP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_UOP_BUFF_SIZE) ))" )
+VTA_UOP_BUFF_SIZE = $(shell echo "$$(( 1 << $(VTA_LOG_UOP_BUFF_SIZE) ))" )
 #  Inp buffer size in Bytes
-INP_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_INP_BUFF_SIZE) ))" )
+VTA_INP_BUFF_SIZE = $(shell echo "$$(( 1 << $(VTA_LOG_INP_BUFF_SIZE) ))" )
 #  Wgt buffer size in Bytes
-WGT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_WGT_BUFF_SIZE) ))" )
+VTA_WGT_BUFF_SIZE = $(shell echo "$$(( 1 << $(VTA_LOG_WGT_BUFF_SIZE) ))" )
 #  Acc buffer size in Bytes
-ACC_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_ACC_BUFF_SIZE) ))" )
+VTA_ACC_BUFF_SIZE = $(shell echo "$$(( 1 << $(VTA_LOG_ACC_BUFF_SIZE) ))" )
 #  Log of out buffer size in Bytes
-LOG_OUT_BUFF_SIZE = $(shell echo "$$(( $(LOG_ACC_BUFF_SIZE)+$(LOG_OUT_WIDTH)-$(LOG_ACC_WIDTH) ))" )
+VTA_LOG_OUT_BUFF_SIZE = \
+$(shell echo "$$(( $(VTA_LOG_ACC_BUFF_SIZE) + $(VTA_LOG_OUT_WIDTH) - $(VTA_LOG_ACC_WIDTH) ))" )
 #  Out buffer size in Bytes
-OUT_BUFF_SIZE = $(shell echo "$$(( 1 << $(LOG_OUT_BUFF_SIZE) ))" )
+VTA_OUT_BUFF_SIZE = $(shell echo "$$(( 1 << $(VTA_LOG_OUT_BUFF_SIZE) ))" )
 
 # Update ADD_CFLAGS
 ADD_CFLAGS += \
 	-D$(TARGET) \
-	-DLOG_WGT_WIDTH=$(LOG_WGT_WIDTH) -DLOG_INP_WIDTH=$(LOG_INP_WIDTH) \
-	-DLOG_ACC_WIDTH=$(LOG_ACC_WIDTH) -DLOG_OUT_WIDTH=$(LOG_OUT_WIDTH) \
-	-DLOG_BATCH=$(LOG_BATCH) -DLOG_BLOCK_IN=$(LOG_BLOCK_IN) -DLOG_BLOCK_OUT=$(LOG_BLOCK_OUT) \
-	-DLOG_UOP_BUFF_SIZE=$(LOG_UOP_BUFF_SIZE) -DLOG_INP_BUFF_SIZE=$(LOG_INP_BUFF_SIZE) \
-	-DLOG_WGT_BUFF_SIZE=$(LOG_WGT_BUFF_SIZE) -DLOG_ACC_BUFF_SIZE=$(LOG_ACC_BUFF_SIZE) \
-	-DLOG_OUT_BUFF_SIZE=$(LOG_OUT_BUFF_SIZE)
\ No newline at end of file
+	-DVTA_LOG_WGT_WIDTH=$(VTA_LOG_WGT_WIDTH) -DVTA_LOG_INP_WIDTH=$(VTA_LOG_INP_WIDTH) \
+	-DVTA_LOG_ACC_WIDTH=$(VTA_LOG_ACC_WIDTH) -DVTA_LOG_OUT_WIDTH=$(VTA_LOG_OUT_WIDTH) \
+	-DVTA_LOG_BATCH=$(VTA_LOG_BATCH) \
+	-DVTA_LOG_BLOCK_IN=$(VTA_LOG_BLOCK_IN) -DVTA_LOG_BLOCK_OUT=$(VTA_LOG_BLOCK_OUT) \
+	-DVTA_LOG_UOP_BUFF_SIZE=$(VTA_LOG_UOP_BUFF_SIZE) -DVTA_LOG_INP_BUFF_SIZE=$(VTA_LOG_INP_BUFF_SIZE) \
+	-DVTA_LOG_WGT_BUFF_SIZE=$(VTA_LOG_WGT_BUFF_SIZE) -DVTA_LOG_ACC_BUFF_SIZE=$(VTA_LOG_ACC_BUFF_SIZE) \
+	-DVTA_LOG_OUT_BUFF_SIZE=$(VTA_LOG_OUT_BUFF_SIZE)
diff --git a/vta/src/pynq/pynq_driver.cc b/vta/src/pynq/pynq_driver.cc
index b4f78db0c160a75470bfd5c771c6b2967f29e4e5..1787af8da5268300951d8c2112c2b358ae7b74a9 100644
--- a/vta/src/pynq/pynq_driver.cc
+++ b/vta/src/pynq/pynq_driver.cc
@@ -29,65 +29,59 @@ void VTAInvalidateCache(void* buf, int size) {
 }
 
 void *VTAMapRegister(uint32_t addr, size_t length) {
-
   // Align the base address with the pages
   uint32_t virt_base = addr & ~(getpagesize() - 1);
   // Calculate base address offset w.r.t the base address
   uint32_t virt_offset = addr - virt_base;
   // Open file and mmap
-  uint32_t mmap_file = open(DEV_MEM_PATH, O_RDWR|O_SYNC);
-
-  return mmap(NULL, (length+virt_offset), PROT_READ|PROT_WRITE, MAP_SHARED, mmap_file, virt_base);
+  uint32_t mmap_file = open(VTA_PYNQ_DEV_MEM_PATH, O_RDWR|O_SYNC);
+  return mmap(NULL,
+              (length+virt_offset),
+              PROT_READ|PROT_WRITE,
+              MAP_SHARED,
+              mmap_file,
+              virt_base);
 }
 
 void VTAUnmapRegister(void *vta, size_t length) {
   // Unmap memory
   int status = munmap(vta, length);
-  assert(status==0);
+  assert(status == 0);
 }
 
 void VTAWriteMappedReg(void* base_addr, uint32_t offset, uint32_t val) {
-  *((volatile uint32_t *) (((char *) base_addr) + offset)) = val;
+  *((volatile uint32_t *) (reinterpret_cast<char *>(base_addr) + offset)) = val;
 }
 
 uint32_t VTAReadMappedReg(void* base_addr, uint32_t offset) {
-  return *((volatile uint32_t *) (((char *) base_addr) + offset));
+  return *((volatile uint32_t *) (reinterpret_cast<char *>(base_addr) + offset));
 }
 
 void VTAProgram(const char* bitstream) {
-
     int elem;
     FILE *src, *dst, *partial;
-
-    partial = fopen(BS_IS_PARTIAL, "w");
+    partial = fopen(VTA_PYNQ_BS_IS_PARTIAL, "w");
     if (partial == NULL) {
-        printf("Cannot open partial config file %s\n", BS_IS_PARTIAL);
-        fclose(partial);
+        printf("Cannot open partial config file %s\n", VTA_PYNQ_BS_IS_PARTIAL);
         exit(1);
     }
     fputc('0', partial);
     fclose(partial);
-
     src = fopen(bitstream, "rb");
     if (src == NULL) {
         printf("Cannot open bitstream %s\n", bitstream);
         exit(1);
     }
-
-    dst = fopen(BS_XDEVCFG, "wb");
+    dst = fopen(VTA_PYNQ_BS_XDEVCFG, "wb");
     if (dst == NULL) {
-        printf("Cannot open device file %s\n", BS_XDEVCFG);
-        fclose(dst);
+        printf("Cannot open device file %s\n", VTA_PYNQ_BS_XDEVCFG);
         exit(1);
     }
-
     elem = fgetc(src);
     while (elem != EOF) {
         fputc(elem, dst);
         elem = fgetc(src);
     }
-
     fclose(src);
     fclose(dst);
-
-}
\ No newline at end of file
+}
diff --git a/vta/src/pynq/pynq_driver.h b/vta/src/pynq/pynq_driver.h
index 9e9482822a212ccb2aa1c1d7acc1f7974bfb8c29..952c4cff8c59ccfe9b93456f311da328ddbd58ff 100644
--- a/vta/src/pynq/pynq_driver.h
+++ b/vta/src/pynq/pynq_driver.h
@@ -4,8 +4,8 @@
  * \brief VTA driver for Pynq board.
  */
 
-#ifndef VTA_PYNQ_DRIVER_H_
-#define VTA_PYNQ_DRIVER_H_
+#ifndef VTA_PYNQ_PYNQ_DRIVER_H_
+#define VTA_PYNQ_PYNQ_DRIVER_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -32,17 +32,20 @@ void xlnkFlushCache(void* buf, int size);
 void xlnkInvalidateCache(void* buf, int size);
 #endif
 
-/*! \brief partial bitstream status file path */
-#define BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream"
-/*! \brief bitstream destination file path */
-#define BS_XDEVCFG "/dev/xdevcfg"
+/*! \brief (Pynq only) Partial bitstream status file path */
+#define VTA_PYNQ_BS_IS_PARTIAL "/sys/devices/soc0/amba/f8007000.devcfg/is_partial_bitstream"
+/*! \brief (Pynq only) Bitstream destination file path */
+#define VTA_PYNQ_BS_XDEVCFG "/dev/xdevcfg"
 
-/*! \brief Path to /dev/mem */
-#define DEV_MEM_PATH "/dev/mem"
-/*! \brief MMIO driver constant */
-#define MMIO_WORD_LENGTH 4
-/*! \brief MMIO driver constant */
-#define MMIO_WORD_MASK (~(MMIO_WORD_LENGTH - 1))
+/*! \brief (Pynq only) Path to /dev/mem */
+#define VTA_PYNQ_DEV_MEM_PATH "/dev/mem"
+/*! \brief (Pynq only) MMIO driver constant */
+#define VTA_PYNQ_MMIO_WORD_LENGTH 4
+/*! \brief (Pynq only) MMIO driver constant */
+#define VTA_PYNQ_MMIO_WORD_MASK (~(VTA_PYNQ_MMIO_WORD_LENGTH - 1))
+
+/*! \brief Physically contiguous buffer size limit */
+#define VTA_MAX_XFER (1<<22)
 
 /*! \brief VTA configuration register address range */
 #define VTA_RANGE 0x100
@@ -74,10 +77,7 @@ void xlnkInvalidateCache(void* buf, int size);
 */
 #define VTA_STORE_ADDR    0x43C30000
 
-/*! \brief Buffer size limit */
-#define MAX_XFER (1<<22)
-
 #ifdef __cplusplus
 }
 #endif
-#endif  // VTA_PYNQ_DRIVER_H_
\ No newline at end of file
+#endif  // VTA_PYNQ_PYNQ_DRIVER_H_
diff --git a/vta/src/runtime.cc b/vta/src/runtime.cc
index 570816e5236a24a7169dc01038b30505dfbd66a9..dde88e8cc82995499a945f31e7f126a5fd4271c8 100644
--- a/vta/src/runtime.cc
+++ b/vta/src/runtime.cc
@@ -4,19 +4,20 @@
  * \brief VTA runtime for PYNQ in C++11
  */
 
+#ifdef VTA_PYNQ_TARGET
+#include "./pynq/pynq_driver.h"
+#endif  // VTA_PYNQ_TARGET
+
+#include <vta/driver.h>
+#include <vta/hw_spec.h>
+#include <vta/runtime.h>
+
 #include <cassert>
 #include <cstring>
 #include <vector>
 #include <thread>
 #include <memory>
 #include <atomic>
-#include <vta/driver.h>
-#include <vta/hw_spec.h>
-#include <vta/runtime.h>
-
-#ifdef PYNQ_TARGET
-#include "./pynq/pynq_driver.h"
-#endif //PYNQ_TARGET
 
 namespace vta {
 
@@ -193,21 +194,21 @@ class UopKernel {
     op.wgt_idx = wgt_index;
     seq_.push_back(op);
     // Ensure that mode is consistent if set
-    if (mode_==0xFFFFFFFF) {
+    if (mode_ == 0xFFFFFFFF) {
       mode_ = mode;
     } else {
-      assert(mode_==mode);
+      assert(mode_ == mode);
     }
     // Check kernel op and imm/imm_val in ALU mode
-    if (mode==1) {
-      if (opcode_==0xFFFFFFFF) {
-        opcode_=opcode;
-        use_imm_=use_imm;
-        imm_val_=imm_val;
+    if (mode == 1) {
+      if (opcode_ == 0xFFFFFFFF) {
+        opcode_ = opcode;
+        use_imm_ = use_imm;
+        imm_val_ = imm_val;
       } else {
-        assert(opcode_==opcode);
-        assert(use_imm_==use_imm);
-        assert(imm_val_==imm_val);
+        assert(opcode_ == opcode);
+        assert(use_imm_ == use_imm);
+        assert(imm_val_ == imm_val);
       }
     }
   }
@@ -222,17 +223,17 @@ class UopKernel {
              seq_[i].src_idx,
              seq_[i].wgt_idx,
              seq_[i].reset_out);
-
     }
     printf("\n");
   }
 
  public:
   // The kernel's mode, opcode, immediate setting and value
-  uint32_t mode_{0xFFFFFFFF}; // UOP type: 0xFFFFFFFF - unset, 0 - GEMM, 1 - ALU
+  uint32_t mode_{0xFFFFFFFF};  // UOP type: 0xFFFFFFFF - unset, 0 - GEMM, 1 - ALU
   uint32_t opcode_{0xFFFFFFFF};
   bool use_imm_{false};
   uint16_t imm_val_{0};
+
  private:
   // Verify that we don't write to the same acc_mem index two cycles in a row
   void VerifyDep(uint32_t dst_index) {
@@ -375,7 +376,7 @@ class UopQueue : public BaseQueue {
     }
     // Simple eviction policy
     uint32_t evict_begin = cache_ptr_;
-    for (;cache_ptr_ < cache_.size(); ++cache_ptr_) {
+    for (; cache_ptr_ < cache_.size(); ++cache_ptr_) {
       if (cache_[cache_ptr_]->sram_begin_ >= sram_end_) break;
       cache_[cache_ptr_]->sram_begin_ = 0;
       cache_[cache_ptr_]->sram_end_ = 0;
@@ -395,7 +396,7 @@ class UopQueue : public BaseQueue {
   void FlushUopLoad(VTAMemInsn* insn) {
     if (sram_begin_ != sram_end_) {
       assert((dram_end_ - dram_begin_) == (sram_end_ - sram_begin_));
-      insn->memory_type = MEM_ID_UOP;
+      insn->memory_type = VTA_MEM_ID_UOP;
       insn->sram_base = sram_begin_;
       insn->dram_base = dram_phy_addr_ / kElemBytes + dram_begin_;
       insn->y_size = 1;
@@ -418,7 +419,7 @@ class UopQueue : public BaseQueue {
   std::vector<UopKernel*> cache_;
   // Constants
   static constexpr int kElemBytes = sizeof(VTAUop);
-  static constexpr int kMaxNumUop = UOP_BUFF_DEPTH;
+  static constexpr int kMaxNumUop = VTA_UOP_BUFF_DEPTH;
   static constexpr int kMaxElems = kMaxBytes / kElemBytes;
 };
 
@@ -541,22 +542,22 @@ class InsnQueue : public BaseQueue {
     for (int i = 1; i < insn_count; ++i) {
       PipelineStage prev = GetPipelineStage(mem_ptr + i - 1);
       PipelineStage now = GetPipelineStage(mem_ptr + i);
-      if (prev==kLoadStage && now==kComputeStage) {
+      if (prev == kLoadStage && now == kComputeStage) {
         mem_ptr[i - 1].push_prev_dep = false;
         mem_ptr[i - 1].push_next_dep = true;
         mem_ptr[i].pop_prev_dep = true;
         mem_ptr[i].pop_next_dep = false;
-      } else if (prev==kComputeStage && now==kLoadStage) {
+      } else if (prev == kComputeStage && now == kLoadStage) {
         mem_ptr[i - 1].push_prev_dep = true;
         mem_ptr[i - 1].push_next_dep = false;
         mem_ptr[i].pop_prev_dep = false;
         mem_ptr[i].pop_next_dep = true;
-      } else if (prev==kStoreStage && now==kComputeStage) {
+      } else if (prev == kStoreStage && now == kComputeStage) {
         mem_ptr[i - 1].push_prev_dep = true;
         mem_ptr[i - 1].push_next_dep = false;
         mem_ptr[i].pop_prev_dep = false;
         mem_ptr[i].pop_next_dep = true;
-      } else if (prev==kComputeStage && now==kStoreStage) {
+      } else if (prev == kComputeStage && now == kStoreStage) {
         mem_ptr[i - 1].push_prev_dep = false;
         mem_ptr[i - 1].push_next_dep = true;
         mem_ptr[i].pop_prev_dep = true;
@@ -573,39 +574,39 @@ class InsnQueue : public BaseQueue {
   // Helper function: Get Opcode string
   const char* getOpcodeString(int opcode, bool use_imm) {
       // The string name
-      if (opcode==ALU_OPCODE_MIN) {
+      if (opcode == VTA_ALU_OPCODE_MIN) {
           if (use_imm) {
               return "min imm";
           } else {
               return "min";
           }
-      } else if (opcode==ALU_OPCODE_MAX) {
+      } else if (opcode == VTA_ALU_OPCODE_MAX) {
           if (use_imm) {
               return "max imm";
           } else {
               return "max";
           }
-      } else if (opcode==ALU_OPCODE_ADD) {
+      } else if (opcode == VTA_ALU_OPCODE_ADD) {
           if (use_imm) {
               return "add imm";
           } else {
               return "add";
           }
-      } else if (opcode==ALU_OPCODE_SUB) {
+      } else if (opcode == VTA_ALU_OPCODE_SUB) {
           if (use_imm) {
               return "sub imm";
           } else {
               return "sub";
           }
-      } else if (opcode==ALU_OPCODE_MUL) {
+      } else if (opcode == VTA_ALU_OPCODE_MUL) {
           if (use_imm) {
               return "mul imm";
           } else {
               return "mul";
           }
-      } else if (opcode==ALU_OPCODE_SHL) {
+      } else if (opcode == VTA_ALU_OPCODE_SHL) {
           return "shl";
-      } else if (opcode==ALU_OPCODE_SHR) {
+      } else if (opcode == VTA_ALU_OPCODE_SHR) {
           return "shr";
       }
 
@@ -629,12 +630,11 @@ class InsnQueue : public BaseQueue {
       // Fetch instruction and decode opcode
       c.generic = insn[i];
       printf("INSTRUCTION %u: ", i);
-      if (c.mem.opcode == OPCODE_LOAD || c.mem.opcode == OPCODE_STORE) {
+      if (c.mem.opcode == VTA_OPCODE_LOAD || c.mem.opcode == VTA_OPCODE_STORE) {
         if (c.mem.x_size == 0) {
-          if (c.mem.opcode == OPCODE_STORE) {
+          if (c.mem.opcode == VTA_OPCODE_STORE) {
             printf("NOP-STORE-STAGE\n");
-          }
-          else if (GetMemPipelineStage(c.mem.memory_type) == kComputeStage) {
+          } else if (GetMemPipelineStage(c.mem.memory_type) == kComputeStage) {
             printf("NOP-COMPUTE-STAGE\n");
           } else {
             printf("NOP-MEMORY-STAGE\n");
@@ -645,15 +645,15 @@ class InsnQueue : public BaseQueue {
                  static_cast<int>(c.mem.push_prev_dep),
                  static_cast<int>(c.mem.push_next_dep));
           // Count status in queues
-          if (c.mem.opcode == OPCODE_LOAD || c.mem.opcode == OPCODE_STORE) {
-            if (c.mem.opcode == OPCODE_STORE) {
+          if (c.mem.opcode == VTA_OPCODE_LOAD || c.mem.opcode == VTA_OPCODE_STORE) {
+            if (c.mem.opcode == VTA_OPCODE_STORE) {
                 assert(c.mem.pop_next_dep == false);
                 assert(c.mem.push_next_dep == false);
                 if (c.mem.pop_prev_dep) g2s_queue--;
                 if (c.mem.push_prev_dep) s2g_queue++;
-            } else if (c.mem.opcode == OPCODE_LOAD &&
-                       (c.mem.memory_type == MEM_ID_INP ||
-                        c.mem.memory_type == MEM_ID_WGT) ) {
+            } else if (c.mem.opcode == VTA_OPCODE_LOAD &&
+                       (c.mem.memory_type == VTA_MEM_ID_INP ||
+                        c.mem.memory_type == VTA_MEM_ID_WGT)) {
                 assert(c.mem.pop_prev_dep == false);
                 assert(c.mem.push_prev_dep == false);
                 if (c.mem.pop_next_dep) g2l_queue--;
@@ -664,7 +664,7 @@ class InsnQueue : public BaseQueue {
                 if (c.mem.pop_next_dep) s2g_queue--;
                 if (c.mem.push_next_dep) g2s_queue++;
             }
-          } else if (c.mem.opcode == OPCODE_GEMM) {
+          } else if (c.mem.opcode == VTA_OPCODE_GEMM) {
             // Print instruction field information
             if (c.gemm.pop_prev_dep) l2g_queue--;
             if (c.gemm.push_prev_dep) g2l_queue++;
@@ -676,14 +676,14 @@ class InsnQueue : public BaseQueue {
           continue;
         }
         // Print instruction field information
-        if (c.mem.opcode==OPCODE_LOAD) {
+        if (c.mem.opcode == VTA_OPCODE_LOAD) {
             printf("LOAD ");
-            if (c.mem.memory_type == MEM_ID_UOP) printf("UOP\n");
-            if (c.mem.memory_type == MEM_ID_WGT) printf("WGT\n");
-            if (c.mem.memory_type == MEM_ID_INP) printf("INP\n");
-            if (c.mem.memory_type == MEM_ID_ACC) printf("ACC\n");
+            if (c.mem.memory_type == VTA_MEM_ID_UOP) printf("UOP\n");
+            if (c.mem.memory_type == VTA_MEM_ID_WGT) printf("WGT\n");
+            if (c.mem.memory_type == VTA_MEM_ID_INP) printf("INP\n");
+            if (c.mem.memory_type == VTA_MEM_ID_ACC) printf("ACC\n");
         }
-        if (c.mem.opcode==OPCODE_STORE) {
+        if (c.mem.opcode == VTA_OPCODE_STORE) {
             printf("STORE\n");
         }
         printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
@@ -703,7 +703,7 @@ class InsnQueue : public BaseQueue {
                static_cast<int>(c.mem.x_stride),
                static_cast<int>(c.mem.x_pad_0),
                static_cast<int>(c.mem.x_pad_1));
-      } else if (c.mem.opcode==OPCODE_GEMM) {
+      } else if (c.mem.opcode == VTA_OPCODE_GEMM) {
         // Print instruction field information
         printf("GEMM\n");
 
@@ -725,7 +725,7 @@ class InsnQueue : public BaseQueue {
                static_cast<int>(c.gemm.wgt_factor_in),
                static_cast<int>(c.gemm.src_factor_in),
                static_cast<int>(c.gemm.dst_factor_in));
-      } else if (c.mem.opcode == OPCODE_ALU) {
+      } else if (c.mem.opcode == VTA_OPCODE_ALU) {
         // Print instruction field information
         printf("ALU - %s\n", getOpcodeString(c.alu.alu_opcode, c.alu.use_imm));
         printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
@@ -744,20 +744,20 @@ class InsnQueue : public BaseQueue {
                static_cast<int>(c.alu.iter_in),
                static_cast<int>(c.alu.dst_factor_in),
                static_cast<int>(c.alu.src_factor_in));
-      } else if (c.mem.opcode == OPCODE_FINISH) {
+      } else if (c.mem.opcode == VTA_OPCODE_FINISH) {
         printf("FINISH\n");
       }
 
       // Count status in queues
-      if (c.mem.opcode == OPCODE_LOAD || c.mem.opcode == OPCODE_STORE) {
-        if (c.mem.opcode == OPCODE_STORE) {
+      if (c.mem.opcode == VTA_OPCODE_LOAD || c.mem.opcode == VTA_OPCODE_STORE) {
+        if (c.mem.opcode == VTA_OPCODE_STORE) {
             assert(c.mem.pop_next_dep == false);
             assert(c.mem.push_next_dep == false);
             if (c.mem.pop_prev_dep) g2s_queue--;
             if (c.mem.push_prev_dep) s2g_queue++;
-        } else if (c.mem.opcode == OPCODE_LOAD &&
-                   (c.mem.memory_type == MEM_ID_INP ||
-                    c.mem.memory_type == MEM_ID_WGT) ) {
+        } else if (c.mem.opcode == VTA_OPCODE_LOAD &&
+                   (c.mem.memory_type == VTA_MEM_ID_INP ||
+                    c.mem.memory_type == VTA_MEM_ID_WGT)) {
             assert(c.mem.pop_prev_dep == false);
             assert(c.mem.push_prev_dep == false);
             if (c.mem.pop_next_dep) g2l_queue--;
@@ -768,8 +768,8 @@ class InsnQueue : public BaseQueue {
             if (c.mem.pop_next_dep) s2g_queue--;
             if (c.mem.push_next_dep) g2s_queue++;
         }
-      } else if (c.mem.opcode == OPCODE_GEMM ||
-                 c.mem.opcode == OPCODE_ALU) {
+      } else if (c.mem.opcode == VTA_OPCODE_GEMM ||
+                 c.mem.opcode == VTA_OPCODE_ALU) {
         // Print instruction field information
         if (c.gemm.pop_prev_dep) l2g_queue--;
         if (c.gemm.push_prev_dep) g2l_queue++;
@@ -832,23 +832,24 @@ class InsnQueue : public BaseQueue {
   }
   // Get stage of the memory
   static PipelineStage GetMemPipelineStage(int memory_type) {
-    if (memory_type == MEM_ID_ACC) return kComputeStage;
-    if (memory_type == MEM_ID_UOP) return kComputeStage;
+    if (memory_type == VTA_MEM_ID_ACC) return kComputeStage;
+    if (memory_type == VTA_MEM_ID_UOP) return kComputeStage;
     return kLoadStage;
   }
   // Get stage of the computation
   static PipelineStage GetPipelineStage(VTAMemInsn* insn) {
-    if (insn->opcode == OPCODE_GEMM) return kComputeStage;
-    if (insn->opcode == OPCODE_ALU) return kComputeStage;
-    if (insn->opcode == OPCODE_LOAD) {
+    if (insn->opcode == VTA_OPCODE_GEMM) return kComputeStage;
+    if (insn->opcode == VTA_OPCODE_ALU) return kComputeStage;
+    if (insn->opcode == VTA_OPCODE_LOAD) {
       if (insn->x_size == 0) return kNoneStage;
-      if (insn->memory_type == MEM_ID_ACC) return kComputeStage;
-      if (insn->memory_type == MEM_ID_UOP) return kComputeStage;
+      if (insn->memory_type == VTA_MEM_ID_ACC) return kComputeStage;
+      if (insn->memory_type == VTA_MEM_ID_UOP) return kComputeStage;
       return kLoadStage;
     }
-    if (insn->opcode == OPCODE_STORE) {
-      // FIXME: Right now memory_type is a 2-bit field which means that MEM_ID_OUT will appear as 0
-      //        For now we'll refrain from checking the memory_type to avoid an assertion error...
+    if (insn->opcode == VTA_OPCODE_STORE) {
+      // FIXME: Right now memory_type is a 2-bit field which means that
+      //        VTA_MEM_ID_OUT will appear as 0. For now we'll refrain from
+      //        checking the memory_type to avoid an assertion error...
       return kStoreStage;
     }
     assert(false);
@@ -859,7 +860,7 @@ class InsnQueue : public BaseQueue {
                 bool push_prev_dep, bool push_next_dep,
                 bool pop_prev_dep, bool pop_next_dep) {
     VTAMemInsn* insn = reinterpret_cast<VTAMemInsn*>(NextInsn());
-    insn->opcode = (stage==kStoreStage ? OPCODE_STORE : OPCODE_LOAD);
+    insn->opcode = (stage == kStoreStage ? VTA_OPCODE_STORE : VTA_OPCODE_LOAD);
     insn->push_prev_dep = push_prev_dep;
     insn->push_next_dep = push_next_dep;
     insn->pop_prev_dep = pop_prev_dep;
@@ -873,7 +874,7 @@ class InsnQueue : public BaseQueue {
     insn->y_pad_1 = 0;
     insn->x_pad_0 = 0;
     insn->x_pad_1 = 0;
-    insn->memory_type = (stage == kLoadStage ? MEM_ID_INP : MEM_ID_UOP);
+    insn->memory_type = (stage == kLoadStage ? VTA_MEM_ID_INP : VTA_MEM_ID_UOP);
   }
 
  private:
@@ -913,12 +914,12 @@ class CommandQueue {
   }
 
   uint32_t GetElemBytes(uint32_t memory_id) {
-    switch (memory_id){
-      case MEM_ID_UOP: return UOP_ELEM_BYTES;
-      case MEM_ID_INP: return INP_ELEM_BYTES;
-      case MEM_ID_WGT: return WGT_ELEM_BYTES;
-      case MEM_ID_ACC: return ACC_ELEM_BYTES;
-      case MEM_ID_OUT: return INP_ELEM_BYTES;
+    switch (memory_id) {
+      case VTA_MEM_ID_UOP: return VTA_UOP_ELEM_BYTES;
+      case VTA_MEM_ID_INP: return VTA_INP_ELEM_BYTES;
+      case VTA_MEM_ID_WGT: return VTA_WGT_ELEM_BYTES;
+      case VTA_MEM_ID_ACC: return VTA_ACC_ELEM_BYTES;
+      case VTA_MEM_ID_OUT: return VTA_INP_ELEM_BYTES;
       default: break;
     }
     printf("Memory id not recognized: %d\n", memory_id);
@@ -938,7 +939,7 @@ class CommandQueue {
                     uint32_t dst_sram_index,
                     uint32_t dst_memory_type) {
     VTAMemInsn* insn = insn_queue_.CreateMemInsn(dst_memory_type);
-    insn->opcode = OPCODE_LOAD;
+    insn->opcode = VTA_OPCODE_LOAD;
     insn->memory_type = dst_memory_type;
     insn->sram_base = dst_sram_index;
     DataBuffer* src = DataBuffer::FromHandle(src_dram_addr);
@@ -961,7 +962,7 @@ class CommandQueue {
                      uint32_t y_size,
                      uint32_t x_stride) {
     VTAMemInsn* insn = insn_queue_.CreateStoreInsn();
-    insn->opcode = OPCODE_STORE;
+    insn->opcode = VTA_OPCODE_STORE;
     insn->memory_type = src_memory_type;
     insn->sram_base = src_sram_index;
     DataBuffer* dst = DataBuffer::FromHandle(dst_dram_addr);
@@ -1013,7 +1014,7 @@ class CommandQueue {
     insn_queue_.CommitPendingPop(kComputeStage);
     // NOTE: FINISH cannot contain pop
     VTAGemInsn* insn = insn_queue_.CreateGemInsn();
-    insn->opcode = OPCODE_FINISH;
+    insn->opcode = VTA_OPCODE_FINISH;
     assert(!insn_queue_.PendingPop());
     // Check if there are no instruction to execute at all
     if (insn_queue_.count() == 0) return;
@@ -1026,11 +1027,11 @@ class CommandQueue {
     }
     // Make sure that the last instruction is a finish instruction
     assert(reinterpret_cast<VTAMemInsn*>(
-        insn_queue_.data())[insn_queue_.count()-1].opcode == OPCODE_FINISH);
+        insn_queue_.data())[insn_queue_.count() - 1].opcode == VTA_OPCODE_FINISH);
 
-#ifdef PYNQ_TARGET
+#ifdef VTA_PYNQ_TARGET
     // Make sure that we don't exceed contiguous physical memory limits
-    assert(insn_queue_.count() < MAX_XFER);
+    assert(insn_queue_.count() < VTA_MAX_XFER);
 
     // NOTE: Register address map is derived from the auto-generated
     // driver files available under hardware/build/vivado/<design>/export/driver
@@ -1064,7 +1065,7 @@ class CommandQueue {
     }
     // Report error if timeout
     assert(t < wait_cycles);
-#endif //PYNQ_TARGET
+#endif  // VTA_PYNQ_TARGET
 
     // Reset buffers
     uop_queue_.Reset();
@@ -1142,12 +1143,12 @@ class CommandQueue {
     uop_queue_.Push(kernel,
                     [this]() { this->AutoSync(); });
     if (uop_queue_.pending()) {
-      VTAMemInsn* insn = insn_queue_.CreateMemInsn(MEM_ID_UOP);
-      insn->opcode = OPCODE_LOAD;
+      VTAMemInsn* insn = insn_queue_.CreateMemInsn(VTA_MEM_ID_UOP);
+      insn->opcode = VTA_OPCODE_LOAD;
       uop_queue_.FlushUopLoad(insn);
     }
     VTAGemInsn* insn = insn_queue_.CreateGemInsn();
-    insn->opcode = OPCODE_GEMM;
+    insn->opcode = VTA_OPCODE_GEMM;
     insn->uop_bgn = kernel->sram_begin_;
     insn->uop_end = kernel->sram_end_;
     const std::vector<UopKernel::LoopEntry> &loop = kernel->loop();
@@ -1180,12 +1181,12 @@ class CommandQueue {
     uop_queue_.Push(kernel,
                     [this]() { this->AutoSync(); });
     if (uop_queue_.pending()) {
-      VTAMemInsn* insn = insn_queue_.CreateMemInsn(MEM_ID_UOP);
-      insn->opcode = OPCODE_LOAD;
+      VTAMemInsn* insn = insn_queue_.CreateMemInsn(VTA_MEM_ID_UOP);
+      insn->opcode = VTA_OPCODE_LOAD;
       uop_queue_.FlushUopLoad(insn);
     }
     VTAAluInsn* insn = insn_queue_.CreateAluInsn();
-    insn->opcode = OPCODE_ALU;
+    insn->opcode = VTA_OPCODE_ALU;
     insn->uop_bgn = kernel->sram_begin_;
     insn->uop_end = kernel->sram_end_;
     insn->alu_opcode = kernel->opcode_;
@@ -1219,7 +1220,7 @@ class CommandQueue {
   void CheckInsnOverFlow() {
     // At each API call, we can at most commit:
     // one pending store, one pending load, and one uop
-    if (insn_queue_.count() >= MAX_XFER) {
+    if (insn_queue_.count() >= VTA_MAX_XFER) {
       this->AutoSync();
     }
   }
@@ -1237,9 +1238,9 @@ class CommandQueue {
   // The kernel we currently recording
   UopKernel* record_kernel_{nullptr};
   // Micro op queue
-  UopQueue<MAX_XFER, true, true> uop_queue_;
+  UopQueue<VTA_MAX_XFER, true, true> uop_queue_;
   // instruction queue
-  InsnQueue<MAX_XFER, true, true> insn_queue_;
+  InsnQueue<VTA_MAX_XFER, true, true> insn_queue_;
 };
 
 }  // namespace vta
@@ -1342,10 +1343,10 @@ void VTAStoreBuffer2D(VTACommandHandle cmd,
                       uint32_t x_size,
                       uint32_t y_size,
                       uint32_t x_stride) {
- static_cast<vta::CommandQueue*>(cmd)->
-     StoreBuffer2D(src_sram_index, src_memory_type,
-                   dst_dram_addr, dst_elem_offset,
-                   x_size, y_size, x_stride);
+  static_cast<vta::CommandQueue*>(cmd)->
+      StoreBuffer2D(src_sram_index, src_memory_type,
+                    dst_dram_addr, dst_elem_offset,
+                    x_size, y_size, x_stride);
 }
 
 void VTAUopPush(uint32_t mode,
diff --git a/vta/src/tvm/vta_device_api.cc b/vta/src/tvm/vta_device_api.cc
index b686b65fc4156bb5a2ed19c42cc17a2735557b8d..b7b57e199f3fa47a2d576027fbf7efb983eb03f8 100644
--- a/vta/src/tvm/vta_device_api.cc
+++ b/vta/src/tvm/vta_device_api.cc
@@ -1,8 +1,14 @@
-// simply include the driver for now.
+/*!
+ *  Copyright (c) 2018 by Contributors
+ * \file vta_device_api.cc
+ * \brief VTA device API for TVM
+ */
+
 #include <tvm/runtime/registry.h>
 #include <dmlc/thread_local.h>
 #include <vta/runtime.h>
-#include "../../tvm/src/runtime/workspace_pool.h"
+
+#include "../../nnvm/tvm/src/runtime/workspace_pool.h"
 
 namespace tvm {
 namespace runtime {
diff --git a/vta/tests/hardware/common/test_lib.cc b/vta/tests/hardware/common/test_lib.cc
index d203b2aa1307534f02642e48ed76a0651a4ce82a..7f46a43b18675e9ff92c8390a435f65e71b32160 100644
--- a/vta/tests/hardware/common/test_lib.cc
+++ b/vta/tests/hardware/common/test_lib.cc
@@ -6,41 +6,43 @@
 
 #include "./test_lib.h"
 
+uint32_t globalSeed;
+
 const char* getOpcodeString(int opcode, bool use_imm) {
   // Returns string name
-  if (opcode == ALU_OPCODE_MIN) {
+  if (opcode == VTA_ALU_OPCODE_MIN) {
     if (use_imm) {
       return "min imm";
     } else {
       return "min";
     }
-  } else if (opcode == ALU_OPCODE_MAX) {
+  } else if (opcode == VTA_ALU_OPCODE_MAX) {
     if (use_imm) {
       return "max imm";
     } else {
       return "max";
     }
-  } else if (opcode == ALU_OPCODE_ADD) {
+  } else if (opcode == VTA_ALU_OPCODE_ADD) {
     if (use_imm) {
       return "add imm";
     } else {
       return "add";
     }
-  } else if (opcode == ALU_OPCODE_SUB) {
+  } else if (opcode == VTA_ALU_OPCODE_SUB) {
     if (use_imm) {
       return "sub imm";
     } else {
       return "sub";
     }
-  } else if (opcode == ALU_OPCODE_MUL) {
+  } else if (opcode == VTA_ALU_OPCODE_MUL) {
     if (use_imm) {
       return "mul imm";
     } else {
       return "mul";
     }
-  } else if (opcode == ALU_OPCODE_SHL) {
+  } else if (opcode == VTA_ALU_OPCODE_SHL) {
     return "shl";
-  } else if (opcode == ALU_OPCODE_SHR) {
+  } else if (opcode == VTA_ALU_OPCODE_SHR) {
     return "shr";
   }
   return "unknown op";
@@ -49,20 +51,20 @@ const char* getOpcodeString(int opcode, bool use_imm) {
 template <typename T, int T_WIDTH>
 void packBuffer(T *dst, T **src, int y_size, int x_size, int y_block, int x_block) {
   int buffer_idx = 0;
-  for(int i = 0; i < y_size / y_block; i ++) {
-    for(int j = 0; j < x_size / x_block; j ++) {
-      for(int k = 0; k < y_block; k ++) {
+  for (int i = 0; i < y_size / y_block; i++) {
+    for (int j = 0; j < x_size / x_block; j++) {
+      for (int k = 0; k < y_block; k++) {
         if (T_WIDTH < 8) {
           for (int l = 0; l < x_block; l += 8 / T_WIDTH) {
             dst[buffer_idx] = 0;
-            for (int m = 0; m < 8 / T_WIDTH; m ++) {
+            for (int m = 0; m < 8 / T_WIDTH; m++) {
               dst[buffer_idx] |= (src[i * y_block + k][j * x_block + l + m] &
                 ((1ULL << T_WIDTH) - 1)) << (m * T_WIDTH);
             }
-            buffer_idx ++;
+            buffer_idx++;
           }
         } else {
-          for (int l = 0; l < x_block; l ++) {
+          for (int l = 0; l < x_block; l++) {
             dst[buffer_idx++] = src[i * y_block + k][j * x_block + l];
           }
         }
@@ -74,20 +76,20 @@ void packBuffer(T *dst, T **src, int y_size, int x_size, int y_block, int x_bloc
 template <typename T, int T_WIDTH>
 void unpackBuffer(T **dst, T *src, int y_size, int x_size, int y_block, int x_block) {
   int buffer_idx = 0;
-  for(int i = 0; i < y_size / y_block; i ++) {
-    for(int j = 0; j < x_size / x_block; j ++) {
-      for(int k = 0; k < y_block; k ++) {
+  for (int i = 0; i < y_size / y_block; i++) {
+    for (int j = 0; j < x_size / x_block; j++) {
+      for (int k = 0; k < y_block; k++) {
         if (T_WIDTH < 8) {
           for (int l = 0; l < x_block; l += 8 / T_WIDTH) {
-            for (int m = 0; m < 8 / T_WIDTH; m ++) {
+            for (int m = 0; m < 8 / T_WIDTH; m++) {
               dst[i * y_block + k][j * x_block + l + m] = (src[buffer_idx] >> (m * T_WIDTH))
                 & ((1 << T_WIDTH) - 1);
             }
-            buffer_idx ++;
+            buffer_idx++;
           }
         } else {
-          for (int l = 0; l < x_block; l ++) {
-            dst[i * y_block + k][j * x_block + l] = src[buffer_idx ++];
+          for (int l = 0; l < x_block; l++) {
+            dst[i * y_block + k][j * x_block + l] = src[buffer_idx++];
           }
         }
       }
@@ -98,14 +100,15 @@ void unpackBuffer(T **dst, T *src, int y_size, int x_size, int y_block, int x_bl
 template <typename T, int T_WIDTH>
 T ** allocInit2dArray(int rows, int cols) {
   // Allocate
-  T **array = (T **) malloc(sizeof(T *) * rows);
-  for (int i = 0; i < rows; i ++) {
-    array[i] = (T *) malloc(sizeof(T) * cols);
+  T **array = static_cast<T **>(malloc(sizeof(T *) * rows));
+  for (int i = 0; i < rows; i++) {
+    array[i] = static_cast<T *>(malloc(sizeof(T) * cols));
   }
   // Init
-  for (int i = 0; i < rows; i ++) {
-    for (int j = 0; j < cols; j ++) {
-      array[i][j] = (T) (rand() % (1LL << (T_WIDTH - 1)) - (1LL << (T_WIDTH - 2)));
+  for (int i = 0; i < rows; i++) {
+    for (int j = 0; j < cols; j++) {
+      array[i][j] =
+          static_cast<T>(rand_r(&globalSeed) % (1LL << (T_WIDTH - 1)) - (1LL << (T_WIDTH - 2)));
     }
   }
   return array;
@@ -113,16 +116,16 @@ T ** allocInit2dArray(int rows, int cols) {
 
 template <typename T>
 T ** alloc2dArray(int rows, int cols) {
-  T **array = (T **) malloc(sizeof(T *) * rows);
-  for (int i = 0; i < rows; i ++) {
-    array[i] = (T *) malloc(sizeof(T) * cols);
+  T **array = static_cast<T **>(malloc(sizeof(T *) * rows));
+  for (int i = 0; i < rows; i++) {
+    array[i] = static_cast<T *>(malloc(sizeof(T) * cols));
   }
   return array;
 }
 
 template <typename T>
 void free2dArray(T **array, int rows, int cols) {
-  for (int i = 0; i < rows; i ++) {
+  for (int i = 0; i < rows; i++) {
     free(array[i]);
   }
   free(array);
@@ -130,11 +133,11 @@ void free2dArray(T **array, int rows, int cols) {
 
 template <typename T>
 T *** alloc3dArray(int rows, int cols, int depth) {
-  T ***array = (T ***) malloc(sizeof(T **) * rows);
-  for (int i = 0; i < rows; i ++) {
-    array[i] = (T **) malloc(sizeof(T *) * cols);
-    for (int j = 0; j < cols; j ++) {
-      array[i][j] = (T*) malloc(sizeof(T) * depth);
+  T ***array = static_cast<T ***>(malloc(sizeof(T **) * rows));
+  for (int i = 0; i < rows; i++) {
+    array[i] = static_cast<T **>(malloc(sizeof(T *) * cols));
+    for (int j = 0; j < cols; j++) {
+      array[i][j] = static_cast<T*>(malloc(sizeof(T) * depth));
     }
   }
   return array;
@@ -142,8 +145,8 @@ T *** alloc3dArray(int rows, int cols, int depth) {
 
 template <typename T>
 void free3dArray(T *** array, int rows, int cols, int depth) {
-  for (int i = 0; i < rows; i ++) {
-    for (int j = 0; j < cols; j ++) {
+  for (int i = 0; i < rows; i++) {
+    for (int j = 0; j < cols; j++) {
       free(array[i][j]);
     }
     free(array[i]);
@@ -153,7 +156,7 @@ void free3dArray(T *** array, int rows, int cols, int depth) {
 
 void * allocBuffer(size_t num_bytes) {
 #ifdef NO_SIM
-  return VTAMemAlloc(num_bytes, CACHED);
+  return VTAMemAlloc(num_bytes, VTA_CACHED);
 #else
   return malloc(num_bytes);
 #endif
@@ -173,7 +176,7 @@ VTAGenericInsn reset2DInsn(int type, int sram_offset, int y_size, int x_size, in
   union VTAInsn converter;
   // Memory instruction initialization
   VTAMemInsn insn = {};
-  insn.opcode = OPCODE_LOAD;
+  insn.opcode = VTA_OPCODE_LOAD;
   insn.pop_prev_dep = pop_prev_dep;
   insn.pop_next_dep = pop_next_dep;
   insn.push_prev_dep = push_prev_dep;
@@ -250,7 +253,7 @@ VTAGenericInsn getGEMMInsn(int uop_offset, int batch, int in_feat, int out_feat,
   union VTAInsn converter;
   // GEVM instruction initialization
   VTAGemInsn insn;
-  insn.opcode = OPCODE_GEMM;
+  insn.opcode = VTA_OPCODE_GEMM;
   insn.pop_prev_dep = pop_prev_dep;
   insn.pop_next_dep = pop_next_dep;
   insn.push_prev_dep = push_prev_dep;
@@ -288,7 +291,7 @@ VTAGenericInsn getALUInsn(int opcode, int vector_size, bool use_imm, int imm, bo
   union VTAInsn converter;
   // Memory instruction initialization
   VTAAluInsn insn = {};
-  insn.opcode = OPCODE_ALU;
+  insn.opcode = VTA_OPCODE_ALU;
   insn.pop_prev_dep = pop_prev_dep;
   insn.pop_next_dep = pop_next_dep;
   insn.push_prev_dep = push_prev_dep;
@@ -327,7 +330,7 @@ VTAGenericInsn getFinishInsn(bool pop_prev, bool pop_next) {
   union VTAInsn converter;
   // GEVM instruction initialization
   VTAGemInsn insn;
-  insn.opcode = OPCODE_FINISH;
+  insn.opcode = VTA_OPCODE_FINISH;
   insn.pop_prev_dep = pop_prev;
   insn.pop_next_dep = pop_next;
   insn.push_prev_dep = 0;
@@ -347,21 +350,20 @@ VTAGenericInsn getFinishInsn(bool pop_prev, bool pop_next) {
 }
 
 VTAUop * getCopyUops(int y_size, int x_size, int uop_compression) {
-
   // Derive the total uop size
   int uop_size = (uop_compression) ? 1 : y_size * x_size;
 
   // Allocate buffer
 #ifdef NO_SIM
-  VTAUop *uop_buf = (VTAUop *) VTAMemAlloc(sizeof(VTAUop) * uop_size, CACHED);
+  VTAUop *uop_buf = static_cast<VTAUop *>(VTAMemAlloc(sizeof(VTAUop) * uop_size, VTA_CACHED));
 #else
-  VTAUop *uop_buf = (VTAUop *) malloc(sizeof(VTAUop) * uop_size);
+  VTAUop *uop_buf = static_cast<VTAUop *>(malloc(sizeof(VTAUop) * uop_size));
 #endif
 
   if (!uop_compression) {
     int uop_idx = 0;
-    for (int i = 0; i < y_size; i ++) {
-      for (int j = 0; j < x_size; j ++) {
+    for (int i = 0; i < y_size; i++) {
+      for (int j = 0; j < x_size; j++) {
         uop_buf[uop_idx].reset_out = false;
         uop_buf[uop_idx].dst_idx = i * x_size + j;
         uop_buf[uop_idx].src_idx = 0;
@@ -381,23 +383,22 @@ VTAUop * getCopyUops(int y_size, int x_size, int uop_compression) {
 
 VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression,
     bool multi_threaded) {
-
   // Derive the total uop size
   int uop_size = (uop_compression) ? batch : batch * in_feat * out_feat;
   if (multi_threaded) uop_size *= 2;
 
   // Allocate buffer
 #ifdef NO_SIM
-  VTAUop *uop_buf = (VTAUop *) VTAMemAlloc(sizeof(VTAUop) * uop_size, CACHED);
+  VTAUop *uop_buf = static_cast<VTAUop *>(VTAMemAlloc(sizeof(VTAUop) * uop_size, VTA_CACHED));
 #else
-  VTAUop *uop_buf = (VTAUop *) malloc(sizeof(VTAUop) * uop_size);
+  VTAUop *uop_buf = static_cast<VTAUop *>(malloc(sizeof(VTAUop) * uop_size));
 #endif
 
   if (!uop_compression) {
     int uop_idx = 0;
-    for (int i = 0; i < batch; i ++) {
-      for (int j = 0; j < in_feat; j ++) {
-        for (int k = 0; k < out_feat; k ++) {
+    for (int i = 0; i < batch; i++) {
+      for (int j = 0; j < in_feat; j++) {
+        for (int k = 0; k < out_feat; k++) {
           uop_buf[uop_idx].reset_out = false;
           uop_buf[uop_idx].dst_idx = i * out_feat + k;
           uop_buf[uop_idx].src_idx = i * in_feat + j;
@@ -407,7 +408,7 @@ VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression,
       }
     }
   } else {
-    for (int i = 0; i < batch; i ++) {
+    for (int i = 0; i < batch; i++) {
       uop_buf[i].reset_out = false;
       uop_buf[i].dst_idx = i * out_feat;
       uop_buf[i].src_idx = i * in_feat;
@@ -418,9 +419,9 @@ VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression,
   if (multi_threaded) {
     if (!uop_compression) {
       int uop_idx = uop_size / 2;
-      for (int i = 0; i < batch; i ++) {
-        for (int j = 0; j < in_feat; j ++) {
-          for (int k = 0; k < out_feat; k ++) {
+      for (int i = 0; i < batch; i++) {
+        for (int j = 0; j < in_feat; j++) {
+          for (int k = 0; k < out_feat; k++) {
             uop_buf[uop_idx].reset_out = false;
             uop_buf[uop_idx].dst_idx = i * out_feat + k;
             uop_buf[uop_idx].src_idx = batch * in_feat + i * in_feat + j;
@@ -430,7 +431,7 @@ VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression,
         }
       }
     } else {
-      for (int i = 0; i < batch; i ++) {
+      for (int i = 0; i < batch; i++) {
         uop_buf[batch+i].reset_out = false;
         uop_buf[batch+i].dst_idx = i * out_feat;
         uop_buf[batch+i].src_idx = batch * in_feat + i * in_feat;
@@ -443,19 +444,18 @@ VTAUop * getGEMMUops(int batch, int in_feat, int out_feat, bool uop_compression,
 }
 
 VTAUop * getMapALUUops(int vector_size, bool uop_compression) {
-
   // Derive the total uop size
   int uop_size = (uop_compression) ? 1 : vector_size;
 
   // Allocate buffer
 #ifdef NO_SIM
-  VTAUop *uop_buf = (VTAUop *) VTAMemAlloc(sizeof(VTAUop) * uop_size, CACHED);
+  VTAUop *uop_buf = static_cast<VTAUop *>(VTAMemAlloc(sizeof(VTAUop) * uop_size, VTA_CACHED));
 #else
-  VTAUop *uop_buf = (VTAUop *) malloc(sizeof(VTAUop) * uop_size);
+  VTAUop *uop_buf = static_cast<VTAUop *>(malloc(sizeof(VTAUop) * uop_size));
 #endif
 
   if (!uop_compression) {
-    for (int i = 0; i < vector_size; i ++) {
+    for (int i = 0; i < vector_size; i++) {
       uop_buf[i].reset_out = 0;
       uop_buf[i].dst_idx = i;
       uop_buf[i].src_idx = vector_size + i;
@@ -473,65 +473,65 @@ void printParameters() {
   // Some debugging code
   printf("Size of VTAInsn: %d\n", sizeof(VTAGenericInsn));
   printf("Size of VTAUop: %d\n", sizeof(VTAUop));
-  printf("UOP_BUFF_DEPTH: %d\n", UOP_BUFF_DEPTH);
-  printf("LOG_UOP_BUFF_DEPTH: %d\n", LOG_UOP_BUFF_DEPTH);
-  printf("WGT_BUFF_DEPTH: %d\n", WGT_BUFF_DEPTH);
-  printf("LOG_WGT_BUFF_DEPTH: %d\n", LOG_WGT_BUFF_DEPTH);
-  printf("INP_BUFF_DEPTH: %d\n", INP_BUFF_DEPTH);
-  printf("LOG_INP_BUFF_DEPTH: %d\n", LOG_INP_BUFF_DEPTH);
-  printf("ACC_BUFF_DEPTH: %d\n", ACC_BUFF_DEPTH);
-  printf("LOG_ACC_BUFF_DEPTH: %d\n", LOG_ACC_BUFF_DEPTH);
-  printf("WGT_WORDS: %d\n", WGT_BUFF_DEPTH*BLOCK_IN*BLOCK_OUT);
-  printf("INP_WORDS: %d\n", INP_BUFF_DEPTH*BLOCK_IN);
-  printf("ACC_WORDS: %d\n", ACC_BUFF_DEPTH*BLOCK_OUT);
-  printf("INS_ELEM_BYTES: %d\n", INS_ELEM_BYTES);
-  printf("UOP_ELEM_BYTES: %d\n", UOP_ELEM_BYTES);
-  printf("INP_ELEM_BYTES: %d\n", INP_ELEM_BYTES);
-  printf("WGT_ELEM_BYTES: %d\n", WGT_ELEM_BYTES);
-  printf("ACC_ELEM_BYTES: %d\n", ACC_ELEM_BYTES);
-  printf("BLOCK_IN: %d\n", BLOCK_IN);
-  printf("BLOCK_OUT: %d\n", BLOCK_OUT);
-  printf("INSN_MEM_0 [%d-%d]\n", INSN_MEM_0_0, INSN_MEM_0_1);
-  printf("INSN_MEM_1 [%d]\n", INSN_MEM_1);
-  printf("INSN_MEM_2 [%d]\n", INSN_MEM_2);
-  printf("INSN_MEM_3 [%d]\n", INSN_MEM_3);
-  printf("INSN_MEM_4 [%d]\n", INSN_MEM_4);
-  printf("INSN_MEM_5 [%d-%d]\n", INSN_MEM_5_0, INSN_MEM_5_1);
-  printf("INSN_MEM_6 [%d-%d]\n", INSN_MEM_6_0, INSN_MEM_6_1);
-  printf("INSN_MEM_7 [%d-%d]\n", INSN_MEM_7_0, INSN_MEM_7_1);
-  printf("INSN_MEM_8 [%d-%d]\n", INSN_MEM_8_0, INSN_MEM_8_1);
-  printf("INSN_MEM_9 [%d-%d]\n", INSN_MEM_9_0, INSN_MEM_9_1);
-  printf("INSN_MEM_A [%d-%d]\n", INSN_MEM_A_0, INSN_MEM_A_1);
-  printf("INSN_MEM_B [%d-%d]\n", INSN_MEM_B_0, INSN_MEM_B_1);
-  printf("INSN_MEM_C [%d-%d]\n", INSN_MEM_C_0, INSN_MEM_C_1);
-  printf("INSN_MEM_D [%d-%d]\n", INSN_MEM_D_0, INSN_MEM_D_1);
-  printf("INSN_MEM_E [%d-%d]\n", INSN_MEM_E_0, INSN_MEM_E_1);
-  printf("INSN_GEM_0 [%d-%d]\n", INSN_GEM_0_0, INSN_GEM_0_1);
-  printf("INSN_GEM_1 [%d]\n", INSN_GEM_1);
-  printf("INSN_GEM_2 [%d]\n", INSN_GEM_2);
-  printf("INSN_GEM_3 [%d]\n", INSN_GEM_3);
-  printf("INSN_GEM_4 [%d]\n", INSN_GEM_4);
-  printf("INSN_GEM_5 [%d-%d]\n", INSN_GEM_5_0, INSN_GEM_5_1);
-  printf("INSN_GEM_6 [%d-%d]\n", INSN_GEM_6_0, INSN_GEM_6_1);
-  printf("INSN_GEM_7 [%d-%d]\n", INSN_GEM_7_0, INSN_GEM_7_1);
-  printf("INSN_GEM_8 [%d-%d]\n", INSN_GEM_8_0, INSN_GEM_8_1);
-  printf("INSN_GEM_9 [%d-%d]\n", INSN_GEM_9_0, INSN_GEM_9_1);
-  printf("INSN_GEM_A [%d-%d]\n", INSN_GEM_A_0, INSN_GEM_A_1);
-  printf("INSN_GEM_B [%d-%d]\n", INSN_GEM_B_0, INSN_GEM_B_1);
-  printf("INSN_GEM_C [%d-%d]\n", INSN_GEM_C_0, INSN_GEM_C_1);
-  printf("INSN_GEM_D [%d-%d]\n", INSN_GEM_D_0, INSN_GEM_D_1);
-  printf("INSN_GEM_E [%d-%d]\n", INSN_GEM_E_0, INSN_GEM_E_1);
-  printf("INSN_ALU_D [%d-%d]\n", INSN_ALU_D_0, INSN_ALU_D_1);
-  printf("INSN_ALU_E [%d]\n", INSN_ALU_E);
-  printf("INSN_ALU_F [%d-%d]\n", INSN_ALU_F_0, INSN_ALU_F_1);
-  printf("UOP_GEM_0 [%d]\n", UOP_GEM_0);
-  printf("UOP_GEM_1 [%d-%d]\n", UOP_GEM_1_0, UOP_GEM_1_1);
-  printf("UOP_GEM_2 [%d-%d]\n", UOP_GEM_2_0, UOP_GEM_2_1);
-  printf("UOP_GEM_3 [%d-%d]\n", UOP_GEM_3_0, UOP_GEM_3_1);
-  printf("UOP_ALU_0 [%d]\n", UOP_ALU_0);
-  printf("UOP_ALU_1 [%d-%d]\n", UOP_ALU_1_0, UOP_ALU_1_1);
-  printf("UOP_ALU_2 [%d-%d]\n", UOP_ALU_2_0, UOP_ALU_2_1);
-  printf("UOP_ALU_3 [%d-%d]\n", UOP_ALU_3_0, UOP_ALU_3_1);
+  printf("VTA_UOP_BUFF_DEPTH: %d\n", VTA_UOP_BUFF_DEPTH);
+  printf("VTA_LOG_UOP_BUFF_DEPTH: %d\n", VTA_LOG_UOP_BUFF_DEPTH);
+  printf("VTA_WGT_BUFF_DEPTH: %d\n", VTA_WGT_BUFF_DEPTH);
+  printf("VTA_LOG_WGT_BUFF_DEPTH: %d\n", VTA_LOG_WGT_BUFF_DEPTH);
+  printf("VTA_INP_BUFF_DEPTH: %d\n", VTA_INP_BUFF_DEPTH);
+  printf("VTA_LOG_INP_BUFF_DEPTH: %d\n", VTA_LOG_INP_BUFF_DEPTH);
+  printf("VTA_ACC_BUFF_DEPTH: %d\n", VTA_ACC_BUFF_DEPTH);
+  printf("VTA_LOG_ACC_BUFF_DEPTH: %d\n", VTA_LOG_ACC_BUFF_DEPTH);
+  printf("VTA_WGT_WORDS: %d\n", VTA_WGT_BUFF_DEPTH*VTA_BLOCK_IN*VTA_BLOCK_OUT);
+  printf("VTA_INP_WORDS: %d\n", VTA_INP_BUFF_DEPTH*VTA_BLOCK_IN);
+  printf("VTA_ACC_WORDS: %d\n", VTA_ACC_BUFF_DEPTH*VTA_BLOCK_OUT);
+  printf("VTA_INS_ELEM_BYTES: %d\n", VTA_INS_ELEM_BYTES);
+  printf("VTA_UOP_ELEM_BYTES: %d\n", VTA_UOP_ELEM_BYTES);
+  printf("VTA_INP_ELEM_BYTES: %d\n", VTA_INP_ELEM_BYTES);
+  printf("VTA_WGT_ELEM_BYTES: %d\n", VTA_WGT_ELEM_BYTES);
+  printf("VTA_ACC_ELEM_BYTES: %d\n", VTA_ACC_ELEM_BYTES);
+  printf("VTA_BLOCK_IN: %d\n", VTA_BLOCK_IN);
+  printf("VTA_BLOCK_OUT: %d\n", VTA_BLOCK_OUT);
+  printf("VTA_INSN_MEM_0 [%d-%d]\n", VTA_INSN_MEM_0_0, VTA_INSN_MEM_0_1);
+  printf("VTA_INSN_MEM_1 [%d]\n", VTA_INSN_MEM_1);
+  printf("VTA_INSN_MEM_2 [%d]\n", VTA_INSN_MEM_2);
+  printf("VTA_INSN_MEM_3 [%d]\n", VTA_INSN_MEM_3);
+  printf("VTA_INSN_MEM_4 [%d]\n", VTA_INSN_MEM_4);
+  printf("VTA_INSN_MEM_5 [%d-%d]\n", VTA_INSN_MEM_5_0, VTA_INSN_MEM_5_1);
+  printf("VTA_INSN_MEM_6 [%d-%d]\n", VTA_INSN_MEM_6_0, VTA_INSN_MEM_6_1);
+  printf("VTA_INSN_MEM_7 [%d-%d]\n", VTA_INSN_MEM_7_0, VTA_INSN_MEM_7_1);
+  printf("VTA_INSN_MEM_8 [%d-%d]\n", VTA_INSN_MEM_8_0, VTA_INSN_MEM_8_1);
+  printf("VTA_INSN_MEM_9 [%d-%d]\n", VTA_INSN_MEM_9_0, VTA_INSN_MEM_9_1);
+  printf("VTA_INSN_MEM_A [%d-%d]\n", VTA_INSN_MEM_A_0, VTA_INSN_MEM_A_1);
+  printf("VTA_INSN_MEM_B [%d-%d]\n", VTA_INSN_MEM_B_0, VTA_INSN_MEM_B_1);
+  printf("VTA_INSN_MEM_C [%d-%d]\n", VTA_INSN_MEM_C_0, VTA_INSN_MEM_C_1);
+  printf("VTA_INSN_MEM_D [%d-%d]\n", VTA_INSN_MEM_D_0, VTA_INSN_MEM_D_1);
+  printf("VTA_INSN_MEM_E [%d-%d]\n", VTA_INSN_MEM_E_0, VTA_INSN_MEM_E_1);
+  printf("VTA_INSN_GEM_0 [%d-%d]\n", VTA_INSN_GEM_0_0, VTA_INSN_GEM_0_1);
+  printf("VTA_INSN_GEM_1 [%d]\n", VTA_INSN_GEM_1);
+  printf("VTA_INSN_GEM_2 [%d]\n", VTA_INSN_GEM_2);
+  printf("VTA_INSN_GEM_3 [%d]\n", VTA_INSN_GEM_3);
+  printf("VTA_INSN_GEM_4 [%d]\n", VTA_INSN_GEM_4);
+  printf("VTA_INSN_GEM_5 [%d-%d]\n", VTA_INSN_GEM_5_0, VTA_INSN_GEM_5_1);
+  printf("VTA_INSN_GEM_6 [%d-%d]\n", VTA_INSN_GEM_6_0, VTA_INSN_GEM_6_1);
+  printf("VTA_INSN_GEM_7 [%d-%d]\n", VTA_INSN_GEM_7_0, VTA_INSN_GEM_7_1);
+  printf("VTA_INSN_GEM_8 [%d-%d]\n", VTA_INSN_GEM_8_0, VTA_INSN_GEM_8_1);
+  printf("VTA_INSN_GEM_9 [%d-%d]\n", VTA_INSN_GEM_9_0, VTA_INSN_GEM_9_1);
+  printf("VTA_INSN_GEM_A [%d-%d]\n", VTA_INSN_GEM_A_0, VTA_INSN_GEM_A_1);
+  printf("VTA_INSN_GEM_B [%d-%d]\n", VTA_INSN_GEM_B_0, VTA_INSN_GEM_B_1);
+  printf("VTA_INSN_GEM_C [%d-%d]\n", VTA_INSN_GEM_C_0, VTA_INSN_GEM_C_1);
+  printf("VTA_INSN_GEM_D [%d-%d]\n", VTA_INSN_GEM_D_0, VTA_INSN_GEM_D_1);
+  printf("VTA_INSN_GEM_E [%d-%d]\n", VTA_INSN_GEM_E_0, VTA_INSN_GEM_E_1);
+  printf("VTA_INSN_ALU_D [%d-%d]\n", VTA_INSN_ALU_D_0, VTA_INSN_ALU_D_1);
+  printf("VTA_INSN_ALU_E [%d]\n", VTA_INSN_ALU_E);
+  printf("VTA_INSN_ALU_F [%d-%d]\n", VTA_INSN_ALU_F_0, VTA_INSN_ALU_F_1);
+  printf("VTA_UOP_GEM_0 [%d]\n", VTA_UOP_GEM_0);
+  printf("VTA_UOP_GEM_1 [%d-%d]\n", VTA_UOP_GEM_1_0, VTA_UOP_GEM_1_1);
+  printf("VTA_UOP_GEM_2 [%d-%d]\n", VTA_UOP_GEM_2_0, VTA_UOP_GEM_2_1);
+  printf("VTA_UOP_GEM_3 [%d-%d]\n", VTA_UOP_GEM_3_0, VTA_UOP_GEM_3_1);
+  printf("VTA_UOP_ALU_0 [%d]\n", VTA_UOP_ALU_0);
+  printf("VTA_UOP_ALU_1 [%d-%d]\n", VTA_UOP_ALU_1_0, VTA_UOP_ALU_1_1);
+  printf("VTA_UOP_ALU_2 [%d-%d]\n", VTA_UOP_ALU_2_0, VTA_UOP_ALU_2_1);
+  printf("VTA_UOP_ALU_3 [%d-%d]\n", VTA_UOP_ALU_3_0, VTA_UOP_ALU_3_1);
 }
 
 void printInstruction(int num_insn, VTAGenericInsn *insns) {
@@ -544,84 +544,111 @@ void printInstruction(int num_insn, VTAGenericInsn *insns) {
   union VTAInsn c;
   // Iterate over all instructions
   printf("DEBUG - There are %u instructions\n", num_insn);
-  for (int i = 0; i < num_insn; i ++) {
+  for (int i = 0; i < num_insn; i++) {
     // Fetch instruction and decode opcode
     c.generic = insns[i];
     printf("DEBUG - INSTRUCTION %u: ", i);
-    if (c.mem.opcode == OPCODE_LOAD || c.mem.opcode == OPCODE_STORE) {
+    if (c.mem.opcode == VTA_OPCODE_LOAD || c.mem.opcode == VTA_OPCODE_STORE) {
       // Print instruction field information
-      if (c.mem.opcode == OPCODE_LOAD) {
+      if (c.mem.opcode == VTA_OPCODE_LOAD) {
         printf("LOAD ");
-        if (c.mem.memory_type == MEM_ID_UOP) printf("UOP\n");
-        if (c.mem.memory_type == MEM_ID_WGT) printf("WGT\n");
-        if (c.mem.memory_type == MEM_ID_INP) printf("INP\n");
-        if (c.mem.memory_type == MEM_ID_ACC) printf("ACC\n");
+        if (c.mem.memory_type == VTA_MEM_ID_UOP) printf("UOP\n");
+        if (c.mem.memory_type == VTA_MEM_ID_WGT) printf("WGT\n");
+        if (c.mem.memory_type == VTA_MEM_ID_INP) printf("INP\n");
+        if (c.mem.memory_type == VTA_MEM_ID_ACC) printf("ACC\n");
       }
-      if (c.mem.opcode == OPCODE_STORE) {
+      if (c.mem.opcode == VTA_OPCODE_STORE) {
         printf("STORE ACC\n");
       }
       printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
-        (int) c.mem.pop_prev_dep, (int) c.mem.pop_next_dep,
-        (int) c.mem.push_prev_dep, (int) c.mem.push_next_dep);
-      printf("\tDRAM: 0x%08x, SRAM:0x%04x\n", (int) c.mem.dram_base, (int) c.mem.sram_base);
-      printf("\ty: size=%d, pad=[%d, %d]\n", (int) c.mem.y_size, (int) c.mem.y_pad_0,
-        (int) c.mem.y_pad_1);
-      printf("\tx: size=%d, stride=%d, pad=[%d, %d]\n", (int) c.mem.x_size, (int) c.mem.x_stride,
-        (int) c.mem.x_pad_0, (int) c.mem.x_pad_1);
-      if (c.mem.opcode == OPCODE_STORE) {
-        if (c.mem.pop_prev_dep) g2s_queue --;
-        if (c.mem.push_prev_dep) s2g_queue ++;
-      } else if (c.mem.opcode == OPCODE_LOAD &&
-        (c.mem.memory_type == MEM_ID_INP || c.mem.memory_type == MEM_ID_WGT)) {
-        if (c.mem.pop_next_dep) g2l_queue --;
-        if (c.mem.push_next_dep) l2g_queue ++;
+             static_cast<int>(c.mem.pop_prev_dep),
+             static_cast<int>(c.mem.pop_next_dep),
+             static_cast<int>(c.mem.push_prev_dep),
+             static_cast<int>(c.mem.push_next_dep));
+      printf("\tDRAM: 0x%08x, SRAM:0x%04x\n",
+             static_cast<int>(c.mem.dram_base),
+             static_cast<int>(c.mem.sram_base));
+      printf("\ty: size=%d, pad=[%d, %d]\n",
+             static_cast<int>(c.mem.y_size),
+             static_cast<int>(c.mem.y_pad_0),
+             static_cast<int>(c.mem.y_pad_1));
+      printf("\tx: size=%d, stride=%d, pad=[%d, %d]\n",
+             static_cast<int>(c.mem.x_size),
+             static_cast<int>(c.mem.x_stride),
+             static_cast<int>(c.mem.x_pad_0),
+             static_cast<int>(c.mem.x_pad_1));
+      if (c.mem.opcode == VTA_OPCODE_STORE) {
+        if (c.mem.pop_prev_dep) g2s_queue--;
+        if (c.mem.push_prev_dep) s2g_queue++;
+      } else if (c.mem.opcode == VTA_OPCODE_LOAD &&
+        (c.mem.memory_type == VTA_MEM_ID_INP || c.mem.memory_type == VTA_MEM_ID_WGT)) {
+        if (c.mem.pop_next_dep) g2l_queue--;
+        if (c.mem.push_next_dep) l2g_queue++;
       } else {
-        if (c.mem.pop_prev_dep) l2g_queue --;
-        if (c.mem.push_prev_dep) g2l_queue ++;
-        if (c.mem.pop_next_dep) s2g_queue --;
-        if (c.mem.push_next_dep) g2s_queue ++;
+        if (c.mem.pop_prev_dep) l2g_queue--;
+        if (c.mem.push_prev_dep) g2l_queue++;
+        if (c.mem.pop_next_dep) s2g_queue--;
+        if (c.mem.push_next_dep) g2s_queue++;
       }
-    } else if (c.mem.opcode == OPCODE_GEMM) {
+    } else if (c.mem.opcode == VTA_OPCODE_GEMM) {
       // Print instruction field information
       printf("GEVM\n");
       printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
-        (int) c.mem.pop_prev_dep, (int) c.mem.pop_next_dep,
-        (int) c.mem.push_prev_dep, (int) c.mem.push_next_dep);
-      printf("\trange (%d, %d)\n", (int) c.gemm.uop_bgn, (int) c.gemm.uop_end);
-      printf("\touter loop - iter: %d, acc: %d, inp: %d, wgt: %d\n", (int) c.gemm.iter_out,
-        (int) c.gemm.dst_factor_out, (int) c.gemm.src_factor_out,
-        (int) c.gemm.wgt_factor_out);
-      printf("\tinner loop - iter: %d, acc: %d, inp: %d, wgt: %d\n", (int) c.gemm.iter_in,
-        (int) c.gemm.dst_factor_in, (int) c.gemm.src_factor_in,
-        (int) c.gemm.wgt_factor_in);
-      if (c.gemm.pop_prev_dep) l2g_queue --;
-      if (c.gemm.push_prev_dep) g2l_queue ++;
-      if (c.gemm.pop_next_dep) s2g_queue --;
-      if (c.gemm.push_next_dep) g2s_queue ++;
-    } else if (c.mem.opcode == OPCODE_FINISH) {
+             static_cast<int>(c.mem.pop_prev_dep),
+             static_cast<int>(c.mem.pop_next_dep),
+             static_cast<int>(c.mem.push_prev_dep),
+             static_cast<int>(c.mem.push_next_dep));
+      printf("\trange (%d, %d)\n",
+             static_cast<int>(c.gemm.uop_bgn),
+             static_cast<int>(c.gemm.uop_end));
+      printf("\touter loop - iter: %d, acc: %d, inp: %d, wgt: %d\n",
+             static_cast<int>(c.gemm.iter_out),
+             static_cast<int>(c.gemm.dst_factor_out),
+             static_cast<int>(c.gemm.src_factor_out),
+             static_cast<int>(c.gemm.wgt_factor_out));
+      printf("\tinner loop - iter: %d, acc: %d, inp: %d, wgt: %d\n",
+             static_cast<int>(c.gemm.iter_in),
+             static_cast<int>(c.gemm.dst_factor_in),
+             static_cast<int>(c.gemm.src_factor_in),
+             static_cast<int>(c.gemm.wgt_factor_in));
+      if (c.gemm.pop_prev_dep) l2g_queue--;
+      if (c.gemm.push_prev_dep) g2l_queue++;
+      if (c.gemm.pop_next_dep) s2g_queue--;
+      if (c.gemm.push_next_dep) g2s_queue++;
+    } else if (c.mem.opcode == VTA_OPCODE_FINISH) {
       printf("FINISH\n");
       printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
-        (int) c.mem.pop_prev_dep, (int) c.mem.pop_next_dep,
-        (int) c.mem.push_prev_dep, (int) c.mem.push_next_dep);
-      if (c.gemm.pop_prev_dep) l2g_queue --;
-      if (c.gemm.push_prev_dep) g2l_queue ++;
-      if (c.gemm.pop_next_dep) s2g_queue --;
-      if (c.gemm.push_next_dep) g2s_queue ++;
-    } else if (c.mem.opcode == OPCODE_ALU) {
+             static_cast<int>(c.mem.pop_prev_dep),
+             static_cast<int>(c.mem.pop_next_dep),
+             static_cast<int>(c.mem.push_prev_dep),
+             static_cast<int>(c.mem.push_next_dep));
+      if (c.gemm.pop_prev_dep) l2g_queue--;
+      if (c.gemm.push_prev_dep) g2l_queue++;
+      if (c.gemm.pop_next_dep) s2g_queue--;
+      if (c.gemm.push_next_dep) g2s_queue++;
+    } else if (c.mem.opcode == VTA_OPCODE_ALU) {
       // Print instruction field information
       printf("ALU - %s\n", getOpcodeString(c.alu.alu_opcode, c.alu.use_imm));
       printf("\tdep - pop prev: %d, pop next: %d, push prev: %d, push next: %d\n",
-        (int) c.mem.pop_prev_dep, (int) c.mem.pop_next_dep,
-        (int) c.mem.push_prev_dep, (int) c.mem.push_next_dep);
-      printf("\trange (%d, %d)\n", (int) c.alu.uop_bgn, (int) c.alu.uop_end);
-      printf("\touter loop - iter: %d, dst: %d, src: %d\n", (int) c.alu.iter_out,
-        (int) c.alu.dst_factor_out, (int) c.alu.src_factor_out);
-      printf("\tinner loop - iter: %d, dst: %d, src: %d\n", (int) c.alu.iter_in,
-        (int) c.alu.dst_factor_in, (int) c.alu.src_factor_in);
-      if (c.alu.pop_prev_dep) l2g_queue --;
-      if (c.alu.push_prev_dep) g2l_queue ++;
-      if (c.alu.pop_next_dep) s2g_queue --;
-      if (c.alu.push_next_dep) g2s_queue ++;
+             static_cast<int>(c.mem.pop_prev_dep),
+             static_cast<int>(c.mem.pop_next_dep),
+             static_cast<int>(c.mem.push_prev_dep),
+             static_cast<int>(c.mem.push_next_dep));
+      printf("\trange (%d, %d)\n",
+             static_cast<int>(c.alu.uop_bgn),
+             static_cast<int>(c.alu.uop_end));
+      printf("\touter loop - iter: %d, dst: %d, src: %d\n",
+             static_cast<int>(c.alu.iter_out),
+             static_cast<int>(c.alu.dst_factor_out),
+             static_cast<int>(c.alu.src_factor_out));
+      printf("\tinner loop - iter: %d, dst: %d, src: %d\n",
+             static_cast<int>(c.alu.iter_in),
+             static_cast<int>(c.alu.dst_factor_in),
+             static_cast<int>(c.alu.src_factor_in));
+      if (c.alu.pop_prev_dep) l2g_queue--;
+      if (c.alu.push_prev_dep) g2l_queue++;
+      if (c.alu.pop_next_dep) s2g_queue--;
+      if (c.alu.push_next_dep) g2s_queue++;
     }
   }
   printf("DEBUG - l2g_queue = %d, g2l_queue = %d\n", l2g_queue, g2l_queue);
@@ -632,174 +659,193 @@ void printInstruction(int num_insn, VTAGenericInsn *insns) {
 void printMicroOp(int num_uop, VTAUop *uops) {
   // Iterate over all micro ops
   printf("DEBUG - There are %u micro-ops\n", num_uop);
-  for (int i = 0; i < num_uop; i ++) {
+  for (int i = 0; i < num_uop; i++) {
     // Read micro-op
     printf("DEBUG - UOP %u: ", i);
     printf("rst_out=%u, acc=%u, inp= %u, wgt=%u\n", uops[i].reset_out, uops[i].dst_idx,
         uops[i].src_idx, uops[i].wgt_idx);
-
   }
 }
 
 int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_compression) {
-
-  assert(batch % BATCH == 0);
-  assert(vector_size % BLOCK_OUT == 0);
-  assert(!(opcode == ALU_OPCODE_SHL && !use_imm));
-  assert(!(opcode == ALU_OPCODE_SHR && !use_imm));
-
+  // Some assertions
+  assert(batch % VTA_BATCH == 0);
+  assert(vector_size % VTA_BLOCK_OUT == 0);
+  assert(!(opcode == VTA_ALU_OPCODE_SHL && !use_imm));
+  assert(!(opcode == VTA_ALU_OPCODE_SHR && !use_imm));
   printf("=====================================================================================\n");
   printf("INFO - ALU test of %s: batch=%d, vector_size=%d, uop_compression=%d\n",
     getOpcodeString(opcode, use_imm), batch, vector_size, uop_compression);
 
   // Instruction count
-  int ins_size = 3 * batch / BATCH + 2;
+  int ins_size = 3 * batch / VTA_BATCH + 2;
   // Micro op count
-  int uop_size = uop_compression ? 1 : vector_size / BLOCK_OUT;
+  int uop_size = uop_compression ? 1 : vector_size / VTA_BLOCK_OUT;
   // Input/output elements in each transfer
-  int tx_size = vector_size / BLOCK_OUT;
+  int tx_size = vector_size / VTA_BLOCK_OUT;
   // Number of input sets to be generated
   int input_sets = (use_imm) ? 1 : 2;
   // Make sure we don't exceed buffer bounds
-  assert(uop_size <= UOP_BUFF_DEPTH);
-  assert(tx_size * input_sets <= ACC_BUFF_DEPTH);
+  assert(uop_size <= VTA_UOP_BUFF_DEPTH);
+  assert(tx_size * input_sets <= VTA_ACC_BUFF_DEPTH);
 
   // Immediate values
-  acc_T *immediate = (acc_T *) malloc(sizeof(acc_T) * batch / BATCH);
-  for (int b = 0; b < batch / BATCH; b ++) {
-    if (opcode == ALU_OPCODE_MIN) {
-      immediate[b] = (acc_T) (rand() % (1LL << (INP_WIDTH / 2)) - (1LL << (INP_WIDTH / 2 - 1)));
-    } else if (opcode == ALU_OPCODE_MAX) {
-      immediate[b] = (acc_T) (rand() % (1LL << (INP_WIDTH / 2)) - (1LL << (INP_WIDTH / 2 - 1)));
-    } else if (opcode == ALU_OPCODE_ADD) {
-      immediate[b] = (acc_T) (rand() % (1LL << (INP_WIDTH / 2)) - (1LL << (INP_WIDTH / 2 - 1)));
-    } else if (opcode == ALU_OPCODE_SUB) {
-      immediate[b] = (acc_T) (rand() % (1LL << (INP_WIDTH / 2)) - (1LL << (INP_WIDTH / 2 - 1)));
-    } else if (opcode == ALU_OPCODE_MUL) {
-      immediate[b] = (acc_T) (rand() % (1LL << (INP_WIDTH / 2)) - (1LL << (INP_WIDTH / 2 - 1)));
-    } else if (opcode == ALU_OPCODE_SHL) {
-      immediate[b] = (acc_T) (rand() % (INP_WIDTH + 1));
-    } else if (opcode == ALU_OPCODE_SHR) {
-      immediate[b] = (acc_T) (rand() % (INP_WIDTH + 1));
+  acc_T *immediate = static_cast<acc_T *>(malloc(sizeof(acc_T) * batch / VTA_BATCH));
+  for (int b = 0; b < batch / VTA_BATCH; b++) {
+    if (opcode == VTA_ALU_OPCODE_MIN) {
+      immediate[b] = static_cast<acc_T>(
+          rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1)));
+    } else if (opcode == VTA_ALU_OPCODE_MAX) {
+      immediate[b] = static_cast<acc_T>(
+          rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1)));
+    } else if (opcode == VTA_ALU_OPCODE_ADD) {
+      immediate[b] = static_cast<acc_T>(
+          rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1)));
+    } else if (opcode == VTA_ALU_OPCODE_SUB) {
+      immediate[b] = static_cast<acc_T>(
+          rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1)));
+    } else if (opcode == VTA_ALU_OPCODE_MUL) {
+      immediate[b] = static_cast<acc_T>(
+          rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1)));
+    } else if (opcode == VTA_ALU_OPCODE_SHL) {
+      immediate[b] = static_cast<acc_T>(rand_r(&globalSeed) % (VTA_INP_WIDTH + 1));
+    } else if (opcode == VTA_ALU_OPCODE_SHR) {
+      immediate[b] = static_cast<acc_T>(rand_r(&globalSeed) % (VTA_INP_WIDTH + 1));
     }
   }
 
   // Initialize instructions
-  VTAGenericInsn *insn_buf = (VTAGenericInsn *) allocBuffer(sizeof(VTAGenericInsn) * ins_size);
+  VTAGenericInsn *insn_buf =
+      static_cast<VTAGenericInsn *>(allocBuffer(sizeof(VTAGenericInsn) * ins_size));
   int insn_idx = 0;
-  insn_buf[insn_idx ++] = get1DLoadStoreInsn(OPCODE_LOAD, MEM_ID_UOP, 0, 0, uop_size, 0, 0, 0, 0);
-  for (int b = 0; b < batch; b += BATCH) {
-    insn_buf[insn_idx ++] = get2DLoadStoreInsn(
-      OPCODE_LOAD,                      // opcode
-      MEM_ID_ACC,                       // vector size
-      0,                                // sram offset
-      b / BATCH * tx_size * input_sets, // dram offset
-      1,                                // y size
-      tx_size * input_sets,             // x size
-      tx_size * input_sets,             // x stride
-      0,                                // y pad
-      0,                                // x pad
-      0,                                // pop prev dep
-      b > 0,                            // pop next dep
-      0,                                // push prev dep
-      0);                               // push next dep
-    insn_buf[insn_idx ++] = getALUInsn(
-      opcode,                           // opcode
-      tx_size,                          // vector size
-      use_imm,                          // use imm
-      immediate[b / BATCH],             // imm
-      uop_compression,                  // uop compression
-      0,                                // pop prev dep
-      0,                                // pop next dep
-      0,                                // push prev dep
-      1);                               // push next dep
-    insn_buf[insn_idx ++] = get2DLoadStoreInsn(
-      OPCODE_STORE,                     // opcode
-      MEM_ID_OUT,                       // vector size
-      0,                                // sram offset
-      b / BATCH * tx_size,              // dram offset
-      1,                                // y size
-      tx_size,                          // x size
-      tx_size,                          // x stride
-      0,                                // y pad
-      0,                                // x pad
-      1,                                // pop prev dep
-      0,                                // pop next dep
-      1,                                // push prev dep
-      0);                               // push next dep
+  insn_buf[insn_idx++] =
+      get1DLoadStoreInsn(VTA_OPCODE_LOAD, VTA_MEM_ID_UOP, 0, 0, uop_size, 0, 0, 0, 0);
+  for (int b = 0; b < batch; b += VTA_BATCH) {
+    insn_buf[insn_idx++] = get2DLoadStoreInsn(
+        VTA_OPCODE_LOAD,                   // opcode
+        VTA_MEM_ID_ACC,                    // vector size
+        0,                                 // sram offset
+        b / VTA_BATCH * tx_size * input_sets,  // dram offset
+        1,                                 // y size
+        tx_size * input_sets,              // x size
+        tx_size * input_sets,              // x stride
+        0,                                 // y pad
+        0,                                 // x pad
+        0,                                 // pop prev dep
+        b > 0,                             // pop next dep
+        0,                                 // push prev dep
+        0);                                // push next dep
+    insn_buf[insn_idx++] = getALUInsn(
+        opcode,                            // opcode
+        tx_size,                           // vector size
+        use_imm,                           // use imm
+        immediate[b / VTA_BATCH],          // imm
+        uop_compression,                   // uop compression
+        0,                                 // pop prev dep
+        0,                                 // pop next dep
+        0,                                 // push prev dep
+        1);                                // push next dep
+    insn_buf[insn_idx++] = get2DLoadStoreInsn(
+        VTA_OPCODE_STORE,                  // opcode
+        VTA_MEM_ID_OUT,                    // vector size
+        0,                                 // sram offset
+        b / VTA_BATCH * tx_size,           // dram offset
+        1,                                 // y size
+        tx_size,                           // x size
+        tx_size,                           // x stride
+        0,                                 // y pad
+        0,                                 // x pad
+        1,                                 // pop prev dep
+        0,                                 // pop next dep
+        1,                                 // push prev dep
+        0);                                // push next dep
   }
   // Finish
-  insn_buf[insn_idx ++] = getFinishInsn(0, 1);
-
+  insn_buf[insn_idx++] = getFinishInsn(0, 1);
   // Prepare the uop buffer
   VTAUop * uop_buf = getMapALUUops(tx_size, uop_compression);
 
-#if DEBUG==1
+#if VTA_DEBUG == 1
   printInstruction(ins_size, insn_buf);
   printMicroOp(uop_size, uop_buf);
 #endif
 
   // Initialize the input/output data
   acc_T **inputs = alloc2dArray<acc_T>(batch, vector_size * input_sets);
-  for (int i = 0; i < batch; i ++) {
-    for (int j = 0; j < vector_size * input_sets; j ++) {
-      if (opcode == ALU_OPCODE_MIN) {
-        inputs[i][j] = (acc_T) (rand() % (1LL << (INP_WIDTH - 1)) - (1LL << (INP_WIDTH - 2)));
-      } else if (opcode == ALU_OPCODE_MAX) {
-        inputs[i][j] = (acc_T) (rand() % (1LL << (INP_WIDTH - 1)) - (1LL << (INP_WIDTH - 2)));
-      } else if (opcode == ALU_OPCODE_ADD) {
-        inputs[i][j] = (acc_T) (rand() % (1LL << (INP_WIDTH - 1)) - (1LL << (INP_WIDTH - 2)));
-      } else if (opcode == ALU_OPCODE_SUB) {
-        inputs[i][j] = (acc_T) (rand() % (1LL << (INP_WIDTH - 1)) - (1LL << (INP_WIDTH - 2)));
-      } else if (opcode == ALU_OPCODE_MUL) {
-        inputs[i][j] = (acc_T) (rand() % (1LL << (INP_WIDTH / 2)) - (1LL << (INP_WIDTH / 2 - 1)));
-      } else if (opcode == ALU_OPCODE_SHL) {
-        inputs[i][j] = (acc_T) (rand() % (1LL << (INP_WIDTH - 1)) - (1LL << (INP_WIDTH - 2)));
-      } else if (opcode == ALU_OPCODE_SHR) {
-        inputs[i][j] = (acc_T) (rand() % (1LL << (INP_WIDTH - 1)) - (1LL << (INP_WIDTH - 2)));
+  for (int i = 0; i < batch; i++) {
+    for (int j = 0; j < vector_size * input_sets; j++) {
+      if (opcode == VTA_ALU_OPCODE_MIN) {
+        inputs[i][j] = static_cast<acc_T>(
+            rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2)));
+      } else if (opcode == VTA_ALU_OPCODE_MAX) {
+        inputs[i][j] = static_cast<acc_T>(
+            rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2)));
+      } else if (opcode == VTA_ALU_OPCODE_ADD) {
+        inputs[i][j] = static_cast<acc_T>(
+            rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2)));
+      } else if (opcode == VTA_ALU_OPCODE_SUB) {
+        inputs[i][j] = static_cast<acc_T>(
+            rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2)));
+      } else if (opcode == VTA_ALU_OPCODE_MUL) {
+        inputs[i][j] = static_cast<acc_T>(
+            rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH / 2)) - (1LL << (VTA_INP_WIDTH / 2 - 1)));
+      } else if (opcode == VTA_ALU_OPCODE_SHL) {
+        inputs[i][j] = static_cast<acc_T>(
+            rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2)));
+      } else if (opcode == VTA_ALU_OPCODE_SHR) {
+        inputs[i][j] = static_cast<acc_T>(
+            rand_r(&globalSeed) % (1LL << (VTA_INP_WIDTH - 1)) - (1LL << (VTA_INP_WIDTH - 2)));
       }
     }
   }
 
   // Compute reference output
   out_T **outputs_ref = alloc2dArray<out_T>(batch, vector_size);
-  for (int i = 0; i < batch; i ++) {
-    for (int j = 0; j < vector_size; j ++) {
+  for (int i = 0; i < batch; i++) {
+    for (int j = 0; j < vector_size; j++) {
       acc_T tmp = 0;
-      if (opcode == ALU_OPCODE_MIN) {
+      if (opcode == VTA_ALU_OPCODE_MIN) {
         if (!use_imm) {
-          tmp = inputs[i][j] < inputs[i][j + vector_size] ? inputs[i][j] : inputs[i][j + vector_size];
+          tmp = inputs[i][j] < inputs[i][j + vector_size] ?
+                    inputs[i][j] :
+                    inputs[i][j + vector_size];
         } else {
-          tmp = inputs[i][j] < immediate[i / BATCH] ? inputs[i][j] : immediate[i / BATCH];
+          tmp = inputs[i][j] < immediate[i / VTA_BATCH] ?
+                    inputs[i][j] :
+                    immediate[i / VTA_BATCH];
         }
-      } else if (opcode == ALU_OPCODE_MAX) {
+      } else if (opcode == VTA_ALU_OPCODE_MAX) {
         if (!use_imm) {
-          tmp = inputs[i][j] > inputs[i][j + vector_size] ? inputs[i][j] : inputs[i][j + vector_size];
+          tmp = inputs[i][j] > inputs[i][j + vector_size] ?
+                    inputs[i][j] :
+                    inputs[i][j + vector_size];
         } else {
-          tmp = inputs[i][j] > immediate[i / BATCH] ? inputs[i][j] : immediate[i / BATCH];
+          tmp = inputs[i][j] > immediate[i / VTA_BATCH] ?
+                    inputs[i][j] :
+                    immediate[i / VTA_BATCH];
         }
-      } else if (opcode == ALU_OPCODE_ADD) {
+      } else if (opcode == VTA_ALU_OPCODE_ADD) {
         if (!use_imm) {
           tmp = inputs[i][j] + inputs[i][j + vector_size];
         } else {
-          tmp = inputs[i][j] + immediate[i / BATCH];
+          tmp = inputs[i][j] + immediate[i / VTA_BATCH];
         }
-      } else if (opcode == ALU_OPCODE_SUB) {
+      } else if (opcode == VTA_ALU_OPCODE_SUB) {
         if (!use_imm) {
           tmp = inputs[i][j] - inputs[i][j + vector_size];
         } else {
-          tmp = inputs[i][j] - immediate[i / BATCH];
+          tmp = inputs[i][j] - immediate[i / VTA_BATCH];
         }
-      } else if (opcode == ALU_OPCODE_MUL) {
+      } else if (opcode == VTA_ALU_OPCODE_MUL) {
         if (!use_imm) {
           tmp = inputs[i][j] * inputs[i][j + vector_size];
         } else {
-          tmp = inputs[i][j] * immediate[i / BATCH];
+          tmp = inputs[i][j] * immediate[i / VTA_BATCH];
         }
-      } else if (opcode == ALU_OPCODE_SHL) {
-        tmp = inputs[i][j] << immediate[i / BATCH];
-      } else if (opcode == ALU_OPCODE_SHR) {
-        tmp = inputs[i][j] >> immediate[i / BATCH];
+      } else if (opcode == VTA_ALU_OPCODE_SHL) {
+        tmp = inputs[i][j] << immediate[i / VTA_BATCH];
+      } else if (opcode == VTA_ALU_OPCODE_SHR) {
+        tmp = inputs[i][j] >> immediate[i / VTA_BATCH];
       }
       // Set
       outputs_ref[i][j] = (out_T) tmp;
@@ -807,44 +853,51 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
   }
 
   // Pack input buffer
-  acc_T *bias_buf = (acc_T *) allocBuffer(ACC_ELEM_BYTES * batch * tx_size * input_sets);
-  packBuffer<acc_T, ACC_WIDTH>(bias_buf, inputs, batch, vector_size * input_sets, BATCH, BLOCK_OUT);
+  acc_T *bias_buf =
+      static_cast<acc_T *>(allocBuffer(VTA_ACC_ELEM_BYTES * batch * tx_size * input_sets));
+  packBuffer<acc_T, VTA_ACC_WIDTH>(
+      bias_buf, inputs, batch, vector_size * input_sets, VTA_BATCH, VTA_BLOCK_OUT);
 
   // Prepare output buffer
-  out_T *output_buf = (out_T *) allocBuffer(INP_ELEM_BYTES * batch * tx_size * input_sets);
+  out_T *output_buf =
+      static_cast<out_T *>(allocBuffer(VTA_INP_ELEM_BYTES * batch * tx_size * input_sets));
 
 #ifdef NO_SIM
   // Invoke the VTA
   uint64_t t_fpga = vta(ins_size, insn_buf, uop_buf, NULL, NULL, bias_buf, output_buf);
   // Report on timining
-  printf("INFO - Synchronization time: %.3lfms\n", (double) t_fpga / 1E6);
-  printf("INFO - Throughput: %.3lfGOps/s\n", (double) vector_size * batch / t_fpga);
+  printf("INFO - Synchronization time: %.3fms\n", static_cast<float>(t_fpga) / 1E6);
+  printf("INFO - Throughput: %.3fGOps/s\n", static_cast<float>(vector_size * batch) / t_fpga);
 #else
   // Invoke the VTA
-  vta(
-    ins_size,
-    (volatile insn_T *) insn_buf,
-    (volatile uop_T *) uop_buf,
-    (volatile inp_vec_T *) NULL,
-    (volatile wgt_vec_T *) NULL,
-    (volatile acc_vec_T *) bias_buf,
-    (volatile inp_vec_T *) output_buf
-  );
+  vta(ins_size,
+      (volatile insn_T *) insn_buf,
+      (volatile uop_T *) uop_buf,
+      (volatile inp_vec_T *) NULL,
+      (volatile wgt_vec_T *) NULL,
+      (volatile acc_vec_T *) bias_buf,
+      (volatile inp_vec_T *) output_buf);
 #endif
 
   // Unpack output buffer
   out_T **outputs = alloc2dArray<out_T>(batch, vector_size);
-  unpackBuffer<out_T, OUT_WIDTH>(outputs, output_buf, batch, vector_size, BATCH, BLOCK_OUT);
+  unpackBuffer<out_T, VTA_OUT_WIDTH>(outputs,
+                                     output_buf,
+                                     batch,
+                                     vector_size,
+                                     VTA_BATCH,
+                                     VTA_BLOCK_OUT);
 
   // Correctness checks
   int err = 0;
-  for (int i = 0; i < batch; i ++) {
-    for (int j = 0; j < vector_size; j ++) {
+  for (int i = 0; i < batch; i++) {
+    for (int j = 0; j < vector_size; j++) {
       if (outputs_ref[i][j] != outputs[i][j]) {
         err++;
-#if DEBUG==1
-        printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j, (int) outputs_ref[i][j],
-            (int) outputs[i][j]);
+#if VTA_DEBUG == 1
+        printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j,
+               static_cast<int>(outputs_ref[i][j]),
+               static_cast<int>(outputs[i][j]));
 #endif
       }
     }
@@ -867,169 +920,180 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
     printf("INFO - ALU test failed, got %d errors!\n", err);
     return -1;
   }
-
 }
 
 int blocked_gemm_test(int batch, int channels, int block, bool uop_compression,
     int virtual_threads) {
-
-  assert(block % BLOCK_IN == 0);
-  assert(block % BLOCK_OUT == 0);
-  assert(block % BATCH == 0);
+  // Some assertions
+  assert(block % VTA_BLOCK_IN == 0);
+  assert(block % VTA_BLOCK_OUT == 0);
+  assert(block % VTA_BATCH == 0);
   assert(channels % block == 0);
   assert(batch % block == 0);
 
   printf("=====================================================================================\n");
-  printf("INFO - Blocked GEMM test: batch=%d, channels=%d, block=%d, uop_compression=%d, \
-virtual_threads=%d\n",
-    batch, channels, block, uop_compression, virtual_threads);
+  printf("INFO - Blocked GEMM test: batch=%d, channels=%d, block=%d, uop_comp=%d, vt=%d\n",
+         batch, channels, block, uop_compression, virtual_threads);
 
   // Input/output channels
   int in_feat = channels;
   int out_feat = channels;
   // Derive number of elements that need to be loaded/stored
   int ins_size = batch / block * out_feat / block * (2 + in_feat / block * 3) + 2;
-  int uop_size = uop_compression ? block / BATCH * virtual_threads :
-    block / BATCH * block / BLOCK_IN * block / BLOCK_OUT * virtual_threads;
-  int inp_size = batch / BATCH * in_feat / BLOCK_IN;
-  int wgt_size = in_feat / BLOCK_IN * out_feat / BLOCK_OUT;
-  int out_size = batch / BATCH * out_feat / BLOCK_OUT;
+  int uop_size = uop_compression ?
+      block / VTA_BATCH * virtual_threads :
+      block / VTA_BATCH * block / VTA_BLOCK_IN * block / VTA_BLOCK_OUT * virtual_threads;
+  int inp_size = batch / VTA_BATCH * in_feat / VTA_BLOCK_IN;
+  int wgt_size = in_feat / VTA_BLOCK_IN * out_feat / VTA_BLOCK_OUT;
+  int out_size = batch / VTA_BATCH * out_feat / VTA_BLOCK_OUT;
   // Blocked buffer sizes (in terms of elements)
-  int inp_block_size = block / BATCH * block / BLOCK_IN;
-  int wgt_block_size = block / BLOCK_IN * block / BLOCK_OUT;
-  int out_block_size = block / BATCH * block / BLOCK_OUT;
+  int inp_block_size = block / VTA_BATCH * block / VTA_BLOCK_IN;
+  int wgt_block_size = block / VTA_BLOCK_IN * block / VTA_BLOCK_OUT;
+  int out_block_size = block / VTA_BATCH * block / VTA_BLOCK_OUT;
   // Make sure we don't exceed buffer bounds
-  assert(uop_size <= UOP_BUFF_DEPTH);
-  assert(inp_block_size <= INP_BUFF_DEPTH);
-  assert(wgt_block_size <= WGT_BUFF_DEPTH);
-  assert(out_block_size <= ACC_BUFF_DEPTH);
+  assert(uop_size <= VTA_UOP_BUFF_DEPTH);
+  assert(inp_block_size <= VTA_INP_BUFF_DEPTH);
+  assert(wgt_block_size <= VTA_WGT_BUFF_DEPTH);
+  assert(out_block_size <= VTA_ACC_BUFF_DEPTH);
 
   // Initialize instruction buffer
-  VTAGenericInsn *insn_buf = (VTAGenericInsn *) allocBuffer(sizeof(VTAGenericInsn) * ins_size);
+  VTAGenericInsn *insn_buf =
+      static_cast<VTAGenericInsn *>(allocBuffer(sizeof(VTAGenericInsn) * ins_size));
   int insn_idx = 0;
 
   // Load uops
-  insn_buf[insn_idx ++] = get1DLoadStoreInsn(OPCODE_LOAD, MEM_ID_UOP, 0, 0, uop_size, 0, 0, 0, 0);
+  insn_buf[insn_idx++] = get1DLoadStoreInsn(VTA_OPCODE_LOAD,
+                                            VTA_MEM_ID_UOP,
+                                            0,
+                                            0,
+                                            uop_size,
+                                            0,
+                                            0,
+                                            0,
+                                            0);
   // Iterate over batch blocks
   for (int i = 0; i < batch; i += block) {
     // Iterate over output channel blocks
     for (int j = 0; j < out_feat; j += block) {
       // Load bias block (pop next if not first, push prev)
-      insn_buf[insn_idx ++] = get2DLoadStoreInsn(
-        OPCODE_LOAD,                                        // opcode
-        MEM_ID_ACC,                                         // type
-        0,                                                  // sram offset
-        (i / BATCH * out_feat + j) / BLOCK_OUT,             // dram offset
-        block / BATCH,                                      // y size
-        block / BLOCK_OUT,                                  // x size
-        out_feat / BLOCK_OUT,                               // x stride
-        0,                                                  // y pad
-        0,                                                  // x pad
-        0,                                                  // pop prev dep
-        (i > 0 || j > 0),                                   // pop next dep
-        (virtual_threads == 1),                             // push prev dep
-        0);                                                 // push next dep
+      insn_buf[insn_idx++] = get2DLoadStoreInsn(
+          VTA_OPCODE_LOAD,                                    // opcode
+          VTA_MEM_ID_ACC,                                     // type
+          0,                                                  // sram offset
+          (i / VTA_BATCH * out_feat + j) / VTA_BLOCK_OUT,     // dram offset
+          block / VTA_BATCH,                                  // y size
+          block / VTA_BLOCK_OUT,                              // x size
+          out_feat / VTA_BLOCK_OUT,                           // x stride
+          0,                                                  // y pad
+          0,                                                  // x pad
+          0,                                                  // pop prev dep
+          (i > 0 || j > 0),                                   // pop next dep
+          (virtual_threads == 1),                             // push prev dep
+          0);                                                 // push next dep
       // Iterate over input channel blocks
       for (int k = 0; k < in_feat; k += block * virtual_threads) {
         for (int l = 0; l < block * virtual_threads; l += block) {
           // Derive dependence flags
-          bool pop = (virtual_threads == 1) ? 
-            1 :
-            (i > 0 || j > 0 || k > 0 || l > 0) && (k + l != block * virtual_threads - block);
+          bool pop = (virtual_threads == 1) ?
+              1 :
+              (i > 0 || j > 0 || k > 0 || l > 0) && (k + l != block * virtual_threads - block);
           bool push_prev = (virtual_threads == 1) ?
-            ((k + l) != in_feat - block) :
-            ((k + l) != in_feat - virtual_threads * block) && 
-            (
-              (k + l != in_feat - block) ||
-              (j != out_feat - block) ||
-              (i != batch - block)
-            );
+              ((k + l) != in_feat - block) :
+              ((k + l) != in_feat - virtual_threads * block) &&
+              (
+                  (k + l != in_feat - block) ||
+                  (j != out_feat - block) ||
+                  (i != batch - block));
           bool push_next = (k + l == in_feat - block);
           // Load weight block (pop next)
-          insn_buf[insn_idx ++] = get2DLoadStoreInsn(
-            OPCODE_LOAD,                                    // opcode
-            MEM_ID_WGT,                                     // type
-            l / BLOCK_IN * block / BLOCK_OUT,               // sram offset
-            (j / BLOCK_OUT * in_feat + k + l) / BLOCK_IN,   // dram offset
-            block / BLOCK_OUT,                              // y size
-            block / BLOCK_IN,                               // x size
-            in_feat / BLOCK_IN,                             // x stride
-            0,                                              // y pad
-            0,                                              // x pad
-            0,                                              // pop prev dep
-            pop,                                            // pop next dep
-            0,                                              // push prev dep
-            0);                                             // push next dep
+          insn_buf[insn_idx++] = get2DLoadStoreInsn(
+              VTA_OPCODE_LOAD,                                // opcode
+              VTA_MEM_ID_WGT,                                 // type
+              l / VTA_BLOCK_IN * block / VTA_BLOCK_OUT,       // sram offset
+              (j / VTA_BLOCK_OUT * in_feat + k + l) / VTA_BLOCK_IN,  // dram offset
+              block / VTA_BLOCK_OUT,                          // y size
+              block / VTA_BLOCK_IN,                           // x size
+              in_feat / VTA_BLOCK_IN,                         // x stride
+              0,                                              // y pad
+              0,                                              // x pad
+              0,                                              // pop prev dep
+              pop,                                            // pop next dep
+              0,                                              // push prev dep
+              0);                                             // push next dep
           // Load input block (push next)
-          insn_buf[insn_idx ++] = get2DLoadStoreInsn(
-            OPCODE_LOAD,                                    // opcode
-            MEM_ID_INP,                                     // type
-            l / BLOCK_IN * block / BATCH,                   // sram offset
-            (i / BATCH * in_feat + k + l) / BLOCK_IN,       // dram offset
-            block / BATCH,                                  // y size
-            block / BLOCK_IN,                               // x size
-            in_feat / BLOCK_IN,                             // x stride
-            0,                                              // y pad
-            0,                                              // x pad
-            0,                                              // pop prev dep
-            0,                                              // pop next dep
-            0,                                              // push prev dep
-            1);                                             // push next dep
+          insn_buf[insn_idx++] = get2DLoadStoreInsn(
+              VTA_OPCODE_LOAD,                                // opcode
+              VTA_MEM_ID_INP,                                 // type
+              l / VTA_BLOCK_IN * block / VTA_BATCH,           // sram offset
+              (i / VTA_BATCH * in_feat + k + l) / VTA_BLOCK_IN,  // dram offset
+              block / VTA_BATCH,                              // y size
+              block / VTA_BLOCK_IN,                           // x size
+              in_feat / VTA_BLOCK_IN,                         // x stride
+              0,                                              // y pad
+              0,                                              // x pad
+              0,                                              // pop prev dep
+              0,                                              // pop next dep
+              0,                                              // push prev dep
+              1);                                             // push next dep
           // Perform GEMM (pop prev, push prev if not last, push next if last)
-          insn_buf[insn_idx ++] = getGEMMInsn(
-            l / block * uop_size / virtual_threads,         // uop offset
-            block / BATCH,                                  // batch
-            block / BLOCK_IN,                               // in_feat
-            block / BLOCK_OUT,                              // out_feat
-            uop_compression,                                // uop_compression
-            1,                                              // pop_prev_dep
-            0,                                              // pop_next_dep
-            push_prev,                                      // push prev dep
-            push_next);                                     // push_next_dep
+          insn_buf[insn_idx++] = getGEMMInsn(
+              l / block * uop_size / virtual_threads,         // uop offset
+              block / VTA_BATCH,                              // batch
+              block / VTA_BLOCK_IN,                           // in_feat
+              block / VTA_BLOCK_OUT,                          // out_feat
+              uop_compression,                                // uop_compression
+              1,                                              // pop_prev_dep
+              0,                                              // pop_next_dep
+              push_prev,                                      // push prev dep
+              push_next);                                     // push_next_dep
         }
       }
       // Store output block (pop prev, push prev if not last)
-      insn_buf[insn_idx ++] = get2DLoadStoreInsn(
-        OPCODE_STORE,                                       // opcode
-        MEM_ID_OUT,                                         // type
-        0,                                                  // sram offset
-        (i / BATCH * out_feat + j) / BLOCK_OUT,             // dram offset
-        block / BATCH,                                      // y size
-        block / BLOCK_OUT,                                  // x size
-        out_feat / BLOCK_OUT,                               // x stride
-        0,                                                  // y pad
-        0,                                                  // x pad
-        1,                                                  // pop prev dep
-        0,                                                  // pop next dep
-        1,                                                  // pop prev dep
-        0);                                                 // push next dep
+      insn_buf[insn_idx++] = get2DLoadStoreInsn(
+          VTA_OPCODE_STORE,                                   // opcode
+          VTA_MEM_ID_OUT,                                     // type
+          0,                                                  // sram offset
+          (i / VTA_BATCH * out_feat + j) / VTA_BLOCK_OUT,     // dram offset
+          block / VTA_BATCH,                                  // y size
+          block / VTA_BLOCK_OUT,                              // x size
+          out_feat / VTA_BLOCK_OUT,                           // x stride
+          0,                                                  // y pad
+          0,                                                  // x pad
+          1,                                                  // pop prev dep
+          0,                                                  // pop next dep
+          1,                                                  // push prev dep
+          0);                                                 // push next dep
     }
   }
   // Finish
-  insn_buf[insn_idx ++] = getFinishInsn(0, 1);
+  insn_buf[insn_idx++] = getFinishInsn(0, 1);
 
   // Prepare the uop buffer
-  VTAUop * uop_buf = getGEMMUops(block / BATCH, block / BLOCK_IN, block / BLOCK_OUT, uop_compression,
-    virtual_threads > 1);
-
-#if DEBUG==1
+  VTAUop * uop_buf = getGEMMUops(
+      block / VTA_BATCH,
+      block / VTA_BLOCK_IN,
+      block / VTA_BLOCK_OUT,
+      uop_compression,
+      virtual_threads > 1);
+
+#if VTA_DEBUG == 1
   printInstruction(ins_size, insn_buf);
   printMicroOp(uop_size, uop_buf);
 #endif
 
   // Initialize inputs
-  inp_T **inputs = allocInit2dArray<inp_T, INP_WIDTH>(batch, in_feat);
+  inp_T **inputs = allocInit2dArray<inp_T, VTA_INP_WIDTH>(batch, in_feat);
   // Initialize weights
-  wgt_T **weights = allocInit2dArray<wgt_T, WGT_WIDTH>(out_feat, in_feat);
+  wgt_T **weights = allocInit2dArray<wgt_T, VTA_WGT_WIDTH>(out_feat, in_feat);
   // Initialize biases
-  acc_T **biases = allocInit2dArray<acc_T, ACC_WIDTH>(batch, out_feat);
+  acc_T **biases = allocInit2dArray<acc_T, VTA_ACC_WIDTH>(batch, out_feat);
 
   // Reference GEMM implementation
   out_T **outputs_ref = alloc2dArray<out_T>(batch, out_feat);
-  for (int i = 0; i < batch; i ++) {
-    for (int j = 0; j < out_feat; j ++) {
+  for (int i = 0; i < batch; i++) {
+    for (int j = 0; j < out_feat; j++) {
       acc_T sum = biases[i][j];
-      for (int k = 0; k < in_feat; k ++) {
+      for (int k = 0; k < in_feat; k++) {
         sum += (acc_T) (inputs[i][k] * weights[j][k]);
       }
       // Set
@@ -1038,49 +1102,75 @@ virtual_threads=%d\n",
   }
 
   // Prepare the input buffer
-  inp_T *input_buf = (inp_T *) allocBuffer(INP_ELEM_BYTES * inp_size);
-  packBuffer<inp_T, INP_WIDTH>(input_buf, inputs, batch, in_feat, BATCH, BLOCK_IN);
+  inp_T *input_buf = static_cast<inp_T *>(allocBuffer(VTA_INP_ELEM_BYTES * inp_size));
+  packBuffer<inp_T, VTA_INP_WIDTH>(input_buf,
+                                   inputs,
+                                   batch,
+                                   in_feat,
+                                   VTA_BATCH,
+                                   VTA_BLOCK_IN);
   // Prepare the weight buffer
-  wgt_T *weight_buf = (wgt_T *) allocBuffer(WGT_ELEM_BYTES * wgt_size);
-  packBuffer<wgt_T, WGT_WIDTH>(weight_buf, weights, out_feat, in_feat, BLOCK_OUT, BLOCK_IN);
+  wgt_T *weight_buf = static_cast<wgt_T *>(allocBuffer(VTA_WGT_ELEM_BYTES * wgt_size));
+  packBuffer<wgt_T, VTA_WGT_WIDTH>(weight_buf,
+                                   weights,
+                                   out_feat,
+                                   in_feat,
+                                   VTA_BLOCK_OUT,
+                                   VTA_BLOCK_IN);
   // Prepare the bias buffer
-  acc_T *bias_buf = (acc_T *) allocBuffer(ACC_ELEM_BYTES * out_size);
-  packBuffer<acc_T, ACC_WIDTH>(bias_buf, biases, batch, out_feat, BATCH, BLOCK_OUT);
+  acc_T *bias_buf = static_cast<acc_T *>(allocBuffer(VTA_ACC_ELEM_BYTES * out_size));
+  packBuffer<acc_T, VTA_ACC_WIDTH>(bias_buf,
+                                   biases,
+                                   batch,
+                                   out_feat,
+                                   VTA_BATCH,
+                                   VTA_BLOCK_OUT);
   // Prepare the output buffer
-  out_T *output_buf = (out_T *) allocBuffer(INP_ELEM_BYTES * out_size);
+  out_T *output_buf = static_cast<out_T *>(allocBuffer(VTA_INP_ELEM_BYTES * out_size));
 
 #ifdef NO_SIM
   // Invoke the VTA
-  uint64_t t_fpga = vta(ins_size, insn_buf, uop_buf, input_buf, weight_buf, bias_buf, output_buf);
+  uint64_t t_fpga = vta(ins_size,
+                        insn_buf,
+                        uop_buf,
+                        input_buf,
+                        weight_buf,
+                        bias_buf,
+                        output_buf);
   // Report on timining
-  printf("INFO - Synchronization time: %.3lfms\n", (double) t_fpga / 1E6);
-  printf("INFO - Throughput: %.3lfGOPs/s\n", (double) batch * in_feat * out_feat * 2 / t_fpga);
+  printf("INFO - Synchronization time: %.3lfms\n", static_cast<float>(t_fpga) / 1E6);
+  printf("INFO - Throughput: %.3lfGOPs/s\n",
+         static_cast<float>(batch) * in_feat * out_feat * 2 / t_fpga);
 #else
   // Invoke the VTA
-  vta(
-    ins_size,
-    (volatile insn_T *) insn_buf,
-    (volatile uop_T *) uop_buf,
-    (volatile inp_vec_T *) input_buf,
-    (volatile wgt_vec_T *) weight_buf,
-    (volatile acc_vec_T *) bias_buf,
-    (volatile inp_vec_T *) output_buf
-  );
+  vta(ins_size,
+      (volatile insn_T *) insn_buf,
+      (volatile uop_T *) uop_buf,
+      (volatile inp_vec_T *) input_buf,
+      (volatile wgt_vec_T *) weight_buf,
+      (volatile acc_vec_T *) bias_buf,
+      (volatile inp_vec_T *) output_buf);
 #endif
 
   // Unpack output data
   out_T **outputs = alloc2dArray<out_T>(batch, out_feat);
-  unpackBuffer<out_T, OUT_WIDTH>(outputs, output_buf, batch, out_feat, BATCH, BLOCK_OUT);
+  unpackBuffer<out_T, VTA_OUT_WIDTH>(outputs,
+                                     output_buf,
+                                     batch,
+                                     out_feat,
+                                     VTA_BATCH,
+                                     VTA_BLOCK_OUT);
 
   // Correctness checks
   int err = 0;
-  for (int i = 0; i < batch; i ++) {
-    for (int j = 0; j < out_feat; j ++) {
+  for (int i = 0; i < batch; i++) {
+    for (int j = 0; j < out_feat; j++) {
       if (outputs_ref[i][j] != outputs[i][j]) {
         err++;
-#if DEBUG==1
-        printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j, (int) outputs_ref[i][j],
-            (int) outputs[i][j]);
+#if VTA_DEBUG == 1
+        printf("DEBUG - %d, %d: expected 0x%x but got 0x%x\n", i, j,
+               static_cast<int>(outputs_ref[i][j]),
+               static_cast<int>(outputs[i][j]));
 #endif
       }
     }
@@ -1092,12 +1182,12 @@ virtual_threads=%d\n",
   free2dArray<acc_T>(biases, batch, out_feat);
   free2dArray<out_T>(outputs_ref, batch, out_feat);
   free2dArray<out_T>(outputs, batch, out_feat);
-  freeBuffer((void *) insn_buf);
-  freeBuffer((void *) uop_buf);
-  freeBuffer((void *) input_buf);
-  freeBuffer((void *) weight_buf);
-  freeBuffer((void *) bias_buf);
-  freeBuffer((void *) output_buf);
+  freeBuffer(insn_buf);
+  freeBuffer(uop_buf);
+  freeBuffer(input_buf);
+  freeBuffer(weight_buf);
+  freeBuffer(bias_buf);
+  freeBuffer(output_buf);
 
   if (err == 0) {
     printf("INFO - Blocked GEMM test successful!\n");
@@ -1106,5 +1196,4 @@ virtual_threads=%d\n",
     printf("INFO - Blocked GEMM test failed, got %d errors!\n", err);
     return -1;
   }
-
 }
diff --git a/vta/tests/hardware/common/test_lib.h b/vta/tests/hardware/common/test_lib.h
index fad2e4daddfb26e4df44a90fff7a980d0b18bda9..037e2fcee72f4feb0e8dc200e984f5a57f539aa3 100644
--- a/vta/tests/hardware/common/test_lib.h
+++ b/vta/tests/hardware/common/test_lib.h
@@ -4,8 +4,8 @@
  * \brief Test library for the VTA design simulation and driver tests.
  */
 
-#ifndef VTA_TESTLIB_H_
-#define VTA_TESTLIB_H_
+#ifndef TESTS_HARDWARE_COMMON_TEST_LIB_H_
+#define TESTS_HARDWARE_COMMON_TEST_LIB_H_
 
 #include <assert.h>
 #include <stdint.h>
@@ -17,9 +17,9 @@
 
 #include <vta/driver.h>
 
-#ifdef PYNQ_TARGET
+#ifdef VTA_PYNQ_TARGET
 #include "../../../src/pynq/pynq_driver.h"
-#endif //PYNQ_TARGET
+#endif  // VTA_PYNQ_TARGET
 
 typedef uint64_t axi_T;
 typedef uint32_t uop_T;
@@ -28,7 +28,7 @@ typedef int8_t inp_T;
 typedef int8_t out_T;
 typedef int32_t acc_T;
 
-uint64_t vta (
+uint64_t vta(
   uint32_t insn_count,
   VTAGenericInsn *insns,
   VTAUop *uops,
@@ -37,11 +37,11 @@ uint64_t vta (
   acc_T *biases,
   inp_T *outputs);
 
-#else //NO_SIM
+#else  // NO_SIM
 
 #include "../../../hardware/vivado/src/vta.h"
 
-#endif //NO_SIM
+#endif  // NO_SIM
 
 /*!
 * \brief Returns opcode string.
@@ -300,4 +300,4 @@ int alu_test(int opcode, bool use_imm, int batch, int vector_size, bool uop_comp
 int blocked_gemm_test(int batch, int channels, int block, bool uop_compression,
   int virtual_threads);
 
-#endif  // VTA_TESTLIB_H_
\ No newline at end of file
+#endif  // TESTS_HARDWARE_COMMON_TEST_LIB_H_
diff --git a/vta/tests/hardware/pynq/metal_test.cc b/vta/tests/hardware/pynq/metal_test.cc
index b5147399c18c450a1580dc4c2b54a177ad8e85e0..01e73f46bd7200396b26fc987b0805f974306f4a 100644
--- a/vta/tests/hardware/pynq/metal_test.cc
+++ b/vta/tests/hardware/pynq/metal_test.cc
@@ -14,140 +14,135 @@
 #include "../common/test_lib.h"
 
 // VTA invocation (present the same abstraction as in the simulation tests)
-uint64_t vta (
-    uint32_t insn_count,
-    VTAGenericInsn *insns,
-    VTAUop *uops,
-    inp_T *inputs,
-    wgt_T *weights,
-    acc_T *biases,
-    inp_T *outputs) {
-
-    // Performance counter variables
-    uint64_t t_fpga;
-    struct timespec start, stop;
-
-    // Derive bitstream file
-    char bitstream[64];
-    char str_batch_size[4];
-    char str_block_out_size[4];
-    char str_block_in_size[4];
-    char str_block_bit_width[4];
-    sprintf(str_batch_size, "%d", BATCH);
-    sprintf(str_block_out_size, "%d", BLOCK_OUT);
-    sprintf(str_block_in_size, "%d", BLOCK_IN);
-    sprintf(str_block_bit_width, "%d", WGT_WIDTH);
-    strcpy(bitstream, "vta.bit");
-
-#if DEBUG==1
-    printf("INFO - Programming FPGA: %s!\n", bitstream);
+uint64_t vta(
+  uint32_t insn_count,
+  VTAGenericInsn *insns,
+  VTAUop *uops,
+  inp_T *inputs,
+  wgt_T *weights,
+  acc_T *biases,
+  inp_T *outputs) {
+  // Performance counter variables
+  uint64_t t_fpga;
+  struct timespec start, stop;
+
+  // Derive bitstream file
+  char bitstream[128];
+  char str_batch_size[4];
+  char str_block_out_size[4];
+  char str_block_in_size[4];
+  char str_block_bit_width[4];
+  snprintf(str_batch_size, sizeof(str_batch_size), "%d", VTA_BATCH);
+  snprintf(str_block_out_size, sizeof(str_block_out_size), "%d", VTA_BLOCK_OUT);
+  snprintf(str_block_in_size, sizeof(str_block_in_size), "%d", VTA_BLOCK_IN);
+  snprintf(str_block_bit_width, sizeof(str_block_bit_width), "%d", VTA_WGT_WIDTH);
+  snprintf(bitstream, sizeof(bitstream), "%s", "vta.bit");
+
+#if VTA_DEBUG == 1
+  printf("INFO - Programming FPGA: %s!\n", bitstream);
 #endif
 
-    // Program VTA
-    VTAProgram(bitstream);
-    // Get VTA handles
-    VTAHandle vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE);
-    VTAHandle vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE);
-    VTAHandle vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
-    VTAHandle vta_store_handle = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);
-
-    // Physical address pointers
-    uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0;
-    uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0;
-    uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0;
-    uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
-    uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
-    uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;
-
-#if DEBUG==1
-    printf("INFO - Starting FPGA!\n");
+  // Program VTA
+  VTAProgram(bitstream);
+  // Get VTA handles
+  VTAHandle vta_fetch_handle = VTAMapRegister(VTA_FETCH_ADDR, VTA_RANGE);
+  VTAHandle vta_load_handle = VTAMapRegister(VTA_LOAD_ADDR, VTA_RANGE);
+  VTAHandle vta_compute_handle = VTAMapRegister(VTA_COMPUTE_ADDR, VTA_RANGE);
+  VTAHandle vta_store_handle = VTAMapRegister(VTA_STORE_ADDR, VTA_RANGE);
+
+  // Physical address pointers
+  uint32_t insn_phy = insns ? cma_get_phy_addr(insns) : 0;
+  uint32_t uop_phy = uops ? cma_get_phy_addr(uops) : 0;
+  uint32_t input_phy = inputs ? cma_get_phy_addr(inputs) : 0;
+  uint32_t weight_phy = weights ? cma_get_phy_addr(weights) : 0;
+  uint32_t bias_phy = biases ? cma_get_phy_addr(biases) : 0;
+  uint32_t output_phy = outputs ? cma_get_phy_addr(outputs) : 0;
+
+#if VTA_DEBUG == 1
+  printf("INFO - Starting FPGA!\n");
 #endif
 
-    clock_gettime(CLOCK_REALTIME, &start);
-
-    // FETCH @ 0x10 : Data signal of insn_count_V
-    VTAWriteMappedReg(vta_fetch_handle, 0x10, insn_count);
-    // FETCH @ 0x18 : Data signal of insns_V
-    if (insns) VTAWriteMappedReg(vta_fetch_handle, 0x18, insn_phy);
-    // LOAD @ 0x10 : Data signal of inputs_V
-    if (inputs) VTAWriteMappedReg(vta_load_handle, 0x10, input_phy);
-    // LOAD @ 0x18 : Data signal of weight_V
-    if (weights) VTAWriteMappedReg(vta_load_handle, 0x18, weight_phy);
-    // COMPUTE @ 0x20 : Data signal of uops_V
-    if (uops) VTAWriteMappedReg(vta_compute_handle, 0x20, uop_phy);
-    // COMPUTE @ 0x28 : Data signal of biases_V
-    if (biases) VTAWriteMappedReg(vta_compute_handle, 0x28, bias_phy);
-    // STORE @ 0x10 : Data signal of outputs_V
-    if (outputs) VTAWriteMappedReg(vta_store_handle, 0x10, output_phy);
-
-    // VTA start
-    VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1);
-    VTAWriteMappedReg(vta_load_handle, 0x0, 0x81);
-    VTAWriteMappedReg(vta_compute_handle, 0x0, 0x81);
-    VTAWriteMappedReg(vta_store_handle, 0x0, 0x81);
-
-    int flag = 0, t = 0;
-    for (t = 0; t < 10000000; ++t) {
-      flag = VTAReadMappedReg(vta_compute_handle, 0x18);
-      if (flag & VTA_DONE) break;
-    }
-
-    if (t==10000000) {
-        printf("\tWARNING: VTA TIMEOUT!!!!\n");
-    }
-#if DEBUG==1
-    else {
-        printf("INFO - FPGA Finished!\n");
-    }
+  clock_gettime(CLOCK_REALTIME, &start);
+
+  // FETCH @ 0x10 : Data signal of insn_count_V
+  VTAWriteMappedReg(vta_fetch_handle, 0x10, insn_count);
+  // FETCH @ 0x18 : Data signal of insns_V
+  if (insns) VTAWriteMappedReg(vta_fetch_handle, 0x18, insn_phy);
+  // LOAD @ 0x10 : Data signal of inputs_V
+  if (inputs) VTAWriteMappedReg(vta_load_handle, 0x10, input_phy);
+  // LOAD @ 0x18 : Data signal of weight_V
+  if (weights) VTAWriteMappedReg(vta_load_handle, 0x18, weight_phy);
+  // COMPUTE @ 0x20 : Data signal of uops_V
+  if (uops) VTAWriteMappedReg(vta_compute_handle, 0x20, uop_phy);
+  // COMPUTE @ 0x28 : Data signal of biases_V
+  if (biases) VTAWriteMappedReg(vta_compute_handle, 0x28, bias_phy);
+  // STORE @ 0x10 : Data signal of outputs_V
+  if (outputs) VTAWriteMappedReg(vta_store_handle, 0x10, output_phy);
+
+  // VTA start
+  VTAWriteMappedReg(vta_fetch_handle, 0x0, 0x1);
+  VTAWriteMappedReg(vta_load_handle, 0x0, 0x81);
+  VTAWriteMappedReg(vta_compute_handle, 0x0, 0x81);
+  VTAWriteMappedReg(vta_store_handle, 0x0, 0x81);
+
+  int flag = 0, t = 0;
+  for (t = 0; t < 10000000; ++t) {
+    flag = VTAReadMappedReg(vta_compute_handle, 0x18);
+    if (flag & VTA_DONE) break;
+  }
+
+  if (t == 10000000) {
+    printf("\tWARNING: VTA TIMEOUT!!!!\n");
+#if VTA_DEBUG == 1
+  } else {
+    printf("INFO - FPGA Finished!\n");
 #endif
+  }
 
-    clock_gettime(CLOCK_REALTIME, &stop);
-    t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec);
+  clock_gettime(CLOCK_REALTIME, &stop);
+  t_fpga = 1000000000ULL * (stop.tv_sec - start.tv_sec) + (stop.tv_nsec - start.tv_nsec);
 
-    // Unmap VTA register
-    VTAUnmapRegister(vta_fetch_handle, VTA_RANGE);
-    VTAUnmapRegister(vta_load_handle, VTA_RANGE);
-    VTAUnmapRegister(vta_compute_handle, VTA_RANGE);
-    VTAUnmapRegister(vta_store_handle, VTA_RANGE);
+  // Unmap VTA register
+  VTAUnmapRegister(vta_fetch_handle, VTA_RANGE);
+  VTAUnmapRegister(vta_load_handle, VTA_RANGE);
+  VTAUnmapRegister(vta_compute_handle, VTA_RANGE);
+  VTAUnmapRegister(vta_store_handle, VTA_RANGE);
 
-    return t_fpga;
-};
-
-int main(void)
-{
+  return t_fpga;
+}
 
-#if DEBUG==1
-    printParameters();
+int main(void) {
+#if VTA_DEBUG == 1
+  printParameters();
 #endif
 
-    int status = 0;
-
-    // Run ALU test (vector-scalar operators)
-    status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, true);
-    status |= alu_test(ALU_OPCODE_MAX, true, 16, 128, false);
-    status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, true);
-    status |= alu_test(ALU_OPCODE_ADD, true, 16, 128, false);
-    status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, true);
-    status |= alu_test(ALU_OPCODE_SHR, true, 16, 128, false);
-
-    // Run ALU test (vector-vector operators)
-    status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, true);
-    status |= alu_test(ALU_OPCODE_MAX, false, 16, 128, false);
-    status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, true);
-    status |= alu_test(ALU_OPCODE_ADD, false, 16, 128, false);
-
-    // Run blocked GEMM test
-    status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 2);
-    status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 2);
-    status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, true, 1);
-    status |= blocked_gemm_test(256, 256, BLOCK_OUT*4, false, 1);
-
-    if (status==0) {
-        printf("\nINFO - Unit tests successful!\n");
-    } else {
-        printf("\nINTO - Unit tests failed!\n");
-    }
-
-    return status;
-
+  int status = 0;
+
+  // Run ALU test (vector-scalar operators)
+  status |= alu_test(VTA_ALU_OPCODE_MAX, true, 16, 128, true);
+  status |= alu_test(VTA_ALU_OPCODE_MAX, true, 16, 128, false);
+  status |= alu_test(VTA_ALU_OPCODE_ADD, true, 16, 128, true);
+  status |= alu_test(VTA_ALU_OPCODE_ADD, true, 16, 128, false);
+  status |= alu_test(VTA_ALU_OPCODE_SHR, true, 16, 128, true);
+  status |= alu_test(VTA_ALU_OPCODE_SHR, true, 16, 128, false);
+
+  // Run ALU test (vector-vector operators)
+  status |= alu_test(VTA_ALU_OPCODE_MAX, false, 16, 128, true);
+  status |= alu_test(VTA_ALU_OPCODE_MAX, false, 16, 128, false);
+  status |= alu_test(VTA_ALU_OPCODE_ADD, false, 16, 128, true);
+  status |= alu_test(VTA_ALU_OPCODE_ADD, false, 16, 128, false);
+
+  // Run blocked GEMM test
+  status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 2);
+  status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 2);
+  status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, true, 1);
+  status |= blocked_gemm_test(256, 256, VTA_BLOCK_OUT*4, false, 1);
+
+  if (status == 0) {
+    printf("\nINFO - Unit tests successful!\n");
+  } else {
+    printf("\nINFO - Unit tests failed!\n");
+  }
+
+  return status;
 }