diff --git a/vta/examples/resnet18/pynq/imagenet_predict.py b/vta/examples/resnet18/pynq/imagenet_predict.py
index 5ff6cc626d6953dd30c78a181910a7c9009650e9..5fce83231c5c5cf31d2450b664cae2660b072bf6 100644
--- a/vta/examples/resnet18/pynq/imagenet_predict.py
+++ b/vta/examples/resnet18/pynq/imagenet_predict.py
@@ -20,7 +20,7 @@ verbose = False
 # only run fpga component, mark non-conv ops as nop
 debug_fpga_only = False
 
-# Obtain model and hardware files (they're too large to check-in)
+# Obtain model files (they're too large to check in)
 # Download them into _data dir
 data_dir = "_data/"
 url = "https://homes.cs.washington.edu/~moreau/media/vta/"
@@ -115,11 +115,6 @@ sym = vta.graph.clean_conv_fuse(sym)
 if target.device_name == "vta":
     sym = vta.graph.pack(sym, shape_dict, bfactor, cfactor)
 
-graph_attr.set_shape_inputs(sym, shape_dict)
-sym = sym.apply("InferShape")
-graph_attr.set_dtype_inputs(sym, dtype_dict)
-sym = sym.apply("InferType")
-
 with nnvm.compiler.build_config(opt_level=3):
     if target.device_name != "vta":
         graph, lib, params = nnvm.compiler.build(
diff --git a/vta/tests/python/integration/test_benchmark_topi_conv2d.py b/vta/tests/python/integration/test_benchmark_topi_conv2d.py
index 24d9968dc31a41fc246e4c22d92a1b996306643d..b70fc9cfbf8535fa3fea684a46318d6d1276b59d 100644
--- a/vta/tests/python/integration/test_benchmark_topi_conv2d.py
+++ b/vta/tests/python/integration/test_benchmark_topi_conv2d.py
@@ -38,7 +38,7 @@ def test_vta_conv2d():
         res_conv = vta.top.packed_conv2d(
             data, kernel, padding=(wl.hpad, wl.wpad), strides=(wl.hstride, wl.wstride))
         res = topi.right_shift(res_conv, 8)
-        res = topi.add(res, bias)
+        res = topi.broadcast_add(res, bias)
         res = my_clip(res, 0, 127)
         res = topi.cast(res, "int8")
 
diff --git a/vta/tutorials/convolution_opt.py b/vta/tutorials/convolution_opt.py
new file mode 100644
index 0000000000000000000000000000000000000000..62efe8c0156bf08459fd0b5621bb86deece6d03b
--- /dev/null
+++ b/vta/tutorials/convolution_opt.py
@@ -0,0 +1,427 @@
+"""
+2D Convolution Optimization
+===========================
+**Author**: `Thierry Moreau <https://homes.cs.washington.edu/~moreau/>`_
+
+This tutorial provides an overview of how to use TVM to map a 2D convolution
+workload efficiently on the VTA design.
+We recommend covering the :ref:`mat-mult-opt` tutorial first.
+
+2D convolution is dominant in most computer vision deep neural networks.
+In this tutorial, we will demonstrate TVM schedule optimizations to map
+2D convolution operators in NCHW layout onto VTA.
+We also introduce the notion of latency hiding, which allows us to
+maximize VTA's compute and memory resource utilization.
+"""
+
+######################################################################
+# RPC Setup
+# ---------
+# We start by programming the Pynq's FPGA and building its RPC runtime.
+
+from __future__ import absolute_import, print_function
+
+import os
+import tvm
+import vta
+import numpy as np
+
+from tvm.contrib import rpc, util
+from vta.testing import simulator
+
+# Load VTA parameters from the config.json file
+env = vta.get_env()
+
+# We read the Pynq RPC host IP address and port number from the OS environment
+host = os.environ.get("VTA_PYNQ_RPC_HOST", "192.168.2.99")
+port = int(os.environ.get("VTA_PYNQ_RPC_PORT", "9091"))
+
+# We configure both the bitstream and the runtime system on the Pynq
+# to match the VTA configuration specified by the config.json file.
+if env.TARGET == "pynq":
+
+    # Make sure that TVM was compiled with RPC=1
+    assert tvm.module.enabled("rpc")
+    remote = rpc.connect(host, port)
+
+    # Reconfigure the JIT runtime
+    vta.reconfig_runtime(remote)
+
+    # Program the FPGA with a pre-compiled VTA bitstream.
+    # You can program the FPGA with your own custom bitstream
+    # by passing the path to the bitstream file instead of None.
+    vta.program_fpga(remote, bitstream=None)
+
+# In simulation mode, host the RPC server locally.
+elif env.TARGET == "sim":
+    remote = rpc.LocalSession()
+
+######################################################################
+# Computation Declaration
+# -----------------------
+# As a first step, we need to describe our 2D convolution computation
+# in NCHW format.
+#
+# We define the 2D convolution shape by the batch size,
+# spatial dimensions, input channels, output channels, kernel dimensions,
+# padding dimensions, and stride dimensions.
+#
+# We pick the shape of the 9th convolutional layer of the ResNet-18
+# architecture as our convolution workload parameters.
+#
+# We've added extra operators to the 2D convolution that apply
+# shifting and clipping to the output in order to mimic a fixed-point
+# convolution followed by a rectified linear activation.
+# We describe the TVM dataflow graph of the 2D convolution layer below:
+#
+# .. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/tutorial/conv2d_dataflow.png
+#      :align: center
+#
+# This computation is intentionally too large to fit onto VTA's on-chip
+# buffers all at once. Therefore, in the scheduling phase, we'll rely on
+# computation blocking strategies to break the computation down into
+# manageable chunks.
+#
+# .. note::
+#
+#   *Spatial padding*
+#
+#   Note that we'll need to import the TOPI library to apply spatial padding
+#   to the input feature map tensor.
+#   Spatial padding facilitates blocking in the context of 2D convolutions
+#   because the same (x, y) spatial location of the input feature map of any
+#   given layer is read more than once if the convolution kernel window size
+#   is greater than one.
+#   On CPUs and GPUs, one way to increase efficiency of memory accesses
+#   when parallelizing work is spatial packing, which requires data re-layout.
+#   VTA's load DMA engine can insert padding automatically so that the
+#   original input feature map does not have to be re-packed in memory.
+#
+#   The figure below shows the effect of VTA's on-the-fly spatial padding as
+#   data is loaded from DRAM into VTA's SRAM through a 2D strided and padded
+#   memory read.
+#
+#   .. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/tutorial/padding.png
+#        :align: center
+#        :width: 480px
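+
+# A tiny NumPy illustration of the padded load described above (a standalone
+# sketch with literal sizes, independent of the VTA workload declared below):
+# a 14x14 plane padded by 1 on each side becomes a 16x16 plane, which is what
+# the DMA engine writes into SRAM without re-packing the copy held in DRAM.
+plane = np.arange(14 * 14).reshape(14, 14)
+padded_plane = np.pad(plane, ((1, 1), (1, 1)), mode="constant")
+print("Padded plane shape:", padded_plane.shape)  # (16, 16)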
+
+import topi
+
+# 2D convolution layer dimensions taken from ResNet-18 architecture
+# (9th convolutional layer)
+batch_size = 1
+height = 14
+width = 14
+in_channels = 256
+out_channels = 256
+kernel_h = 3
+kernel_w = 3
+pad_h = 1
+pad_w = 1
+stride_h = 1
+stride_w = 1
+assert batch_size % env.BATCH == 0
+assert in_channels % env.BLOCK_IN == 0
+assert out_channels % env.BLOCK_OUT == 0
+
+# Input feature map: (N, IC, H, W, n, ic)
+data_shape = (batch_size // env.BATCH,
+              in_channels // env.BLOCK_IN,
+              height,
+              width,
+              env.BATCH,
+              env.BLOCK_IN)
+# Kernel: (OC, IC, H, W, oc, ic)
+kernel_shape = (out_channels // env.BLOCK_OUT,
+                in_channels // env.BLOCK_IN,
+                kernel_h,
+                kernel_w,
+                env.BLOCK_OUT,
+                env.BLOCK_IN)
+# Derive output feature map dimensions
+fout_height = (height + 2 * pad_h - kernel_h) // stride_h + 1
+fout_width = (width + 2 * pad_w - kernel_w) // stride_w + 1
+# Output feature map: (N, OC, H, W, n, oc)
+output_shape = (batch_size // env.BATCH,
+                out_channels // env.BLOCK_OUT,
+                fout_height,
+                fout_width,
+                env.BATCH,
+                env.BLOCK_OUT)
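+
+# As noted above, these tensors are too large to fit in VTA's on-chip SRAM all
+# at once. As a rough sanity check (a sketch: we assume env.INP_WIDTH,
+# env.WGT_WIDTH, and env.ACC_WIDTH expose the bit widths of the input, weight,
+# and accumulator types from the loaded VTA configuration), we can print the
+# footprint of the full input, kernel, and accumulator tensors; they are larger
+# than what the on-chip buffers can hold at once in a typical configuration
+# (tens to a few hundred kB).
+inp_bytes = np.prod(data_shape) * env.INP_WIDTH // 8
+wgt_bytes = np.prod(kernel_shape) * env.WGT_WIDTH // 8
+acc_bytes = np.prod(output_shape) * env.ACC_WIDTH // 8
+print("Input feature map footprint:  %d kB" % (inp_bytes // 1024))
+print("Kernel footprint:             %d kB" % (wgt_bytes // 1024))
+print("Accumulator output footprint: %d kB" % (acc_bytes // 1024))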
+
+# Convolution reduction axes
+dy = tvm.reduce_axis((0, kernel_h), name='dy')
+dx = tvm.reduce_axis((0, kernel_w), name='dx')
+ic = tvm.reduce_axis((0, in_channels // env.BLOCK_IN), name='ic')
+ic_tns = tvm.reduce_axis((0, env.BLOCK_IN), name='ic_tns')
+
+# Input placeholder tensors
+data = tvm.placeholder(data_shape,
+                       name="data",
+                       dtype=env.inp_dtype)
+kernel = tvm.placeholder(kernel_shape,
+                         name="kernel",
+                         dtype=env.wgt_dtype)
+
+# Copy buffers:
+#   Apply spatial padding to input feature map
+data_buf = topi.nn.pad(data,
+                       [0, 0, pad_h, pad_w, 0, 0],
+                       name="data_buf")
+kernel_buf = tvm.compute(kernel_shape, lambda *i: kernel(*i), "kernel_buf")
+
+# Declare 2D convolution
+res_conv = tvm.compute(
+    output_shape,
+    lambda bo, co, i, j, bi, ci: tvm.sum(
+      data_buf[bo, ic, i*stride_h+dy, j*stride_w+dx, bi, ic_tns].astype(env.acc_dtype) *
+      kernel_buf[co, ic, dy, dx, ci, ic_tns].astype(env.acc_dtype),
+    axis=[ic, dy, dx, ic_tns]),
+    name="res_conv")
+
+# Add shift stage for fixed-point normalization
+res_shr = tvm.compute(output_shape,
+                      lambda *i: res_conv(*i) >> 8,
+                      name="res_shr")
+
+# Apply clipping between (0, input max value)
+inp_max = (1 << (env.INP_WIDTH - 1)) - 1
+res_max = tvm.compute(output_shape,
+                      lambda *i: tvm.max(res_shr(*i), 0),
+                      "res_max")
+res_min = tvm.compute(output_shape,
+                      lambda *i: tvm.min(res_max(*i), inp_max),
+                      "res_min")
+
+# Result Tensor
+res = tvm.compute(output_shape,
+                  lambda *i: res_min(*i).astype(env.inp_dtype),
+                  name="res")
+
+
+######################################################################
+# Scheduling the Computation
+# --------------------------
+# We'll look at a set of schedule transformations necessary to map the
+# 2D convolution onto VTA in an efficient fashion.
+# These include:
+#
+# - Computation blocking
+# - Virtual threading to increase compute utilization
+# - Lowering to VTA hardware intrinsics
+
+# Create TVM schedule
+s = tvm.create_schedule(res.op)
+# Let's look at the default TVM schedule
+print(tvm.lower(s, [data, kernel, res], simple_mode=True))
+
+######################################################################
+# Blocking the Computation
+# ~~~~~~~~~~~~~~~~~~~~~~~~
+# The 2D convolution workload is too large for either the activations or the
+# kernel weights to fit in VTA's on-chip buffers all at once.
+# We apply blocking along the input channels, the output channels, and the
+# height spatial dimension.
+# We don't block along the width spatial dimension since it's the innermost
+# dimension in the NCHW layout (and consequently, to preserve locality, it's
+# best not to block along the innermost dimension).
+
+# Let's define tiling sizes
+b_block = 1 // env.BATCH
+oc_block = 128 // env.BLOCK_OUT
+ic_block = 16 // env.BLOCK_IN
+h_block = 7
+w_block = 14
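+
+# Rough per-tile working-set estimate (a sketch, with the same bit-width
+# assumptions as the footprint estimate above). Each blocked tile is small
+# enough to be staged in VTA's on-chip SRAM; the batch is not blocked since
+# we run single-batch inference.
+acc_tile_bytes = batch_size * (oc_block * env.BLOCK_OUT) \
+    * h_block * w_block * env.ACC_WIDTH // 8
+wgt_tile_bytes = (oc_block * env.BLOCK_OUT) * (ic_block * env.BLOCK_IN) \
+    * kernel_h * kernel_w * env.WGT_WIDTH // 8
+# The input tile needs (kernel_h - 1) extra halo rows for the 3x3 window,
+# plus the full padded width since we don't block along width.
+inp_tile_bytes = batch_size * (ic_block * env.BLOCK_IN) \
+    * (h_block + kernel_h - 1) * (w_block + 2 * pad_w) * env.INP_WIDTH // 8
+print("Accumulator tile footprint: %d kB" % (acc_tile_bytes // 1024))
+print("Kernel tile footprint:      %d kB" % (wgt_tile_bytes // 1024))
+print("Input tile footprint:       %d kB" % (inp_tile_bytes // 1024))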
+
+# Tile the output tensor along the spatial and output channel dimensions
+# (since by default we are doing single batch inference, the split along
+#  the batch dimension has no effect)
+b, oc, y, x, b_tns, oc_tns = s[res].op.axis
+b_out, b_inn = s[res].split(b, factor=b_block)
+oc_out, oc_inn = s[res].split(oc, factor=oc_block)
+y_out, y_inn = s[res].split(y, factor=h_block)
+x_out, x_inn = s[res].split(x, factor=w_block)
+s[res].reorder(b_out, oc_out, y_out, x_out, b_inn, oc_inn, y_inn, x_inn, b_tns, oc_tns)
+
+# Move intermediate computation into each output compute tile
+s[res_conv].compute_at(s[res], x_out)
+s[res_shr].compute_at(s[res], x_out)
+s[res_max].compute_at(s[res], x_out)
+s[res_min].compute_at(s[res], x_out)
+
+# Apply additional loop split along reduction axis (input channel)
+b_inn, oc_inn, y_inn, x_inn, b_tns, oc_tns = s[res_conv].op.axis
+ic_out, ic_inn = s[res_conv].split(ic, factor=ic_block)
+
+# Reorder axes.
+# 1) Group the VTA tensor axes in the innermost position: b_tns, oc_tns, ic_tns
+#    to allow TVM to tensorize.
+# 2) We move the ic_out axis all the way out of the convolution loop to block
+#    along the reduction axis.
+# 3) Now we re-order the block axes: b_inn, oc_inn, y_inn, x_inn, ic_inn, dy, dx.
+#    VTA runtime/hardware requires us to write to a different output feature map
+#    location for every VTA tensor operation.
+#    This restriction requires us to order one of oc_inn, y_inn or x_inn right
+#    before b_tns, since they all affect output feature map indexing.
+#    Therefore, we choose to bring x_inn inside as shown below.
+s[res_conv].reorder(ic_out, b_inn, oc_inn, y_inn, ic_inn, dy, dx, x_inn, b_tns, oc_tns, ic_tns)
+
+######################################################################
+# Virtual Threading
+# ~~~~~~~~~~~~~~~~~
+# Virtual threading is a mechanism that increases task-level pipeline
+# parallelism in the VTA hardware design.
+# Put another way, it increases compute resource utilization by hiding
+# memory access latency.
+#
+# In the implementation below, virtual threading distributes work across two
+# threads split along the output channel axis.
+# We show how work is split when computing the 2D convolution in the figure
+# below.
+#
+# .. image:: https://raw.githubusercontent.com/uwsaml/web-data/master/vta/tutorial/virtual_threading.png
+#      :align: center
+#      :width: 480px
+
+# VTA only supports 2 virtual threads
+v_threads = 2
+
+# Perform virtual thread split along output channel outer axis
+_, tx = s[res].split(oc_out, factor=v_threads)
+s[res].reorder(tx, b_out)
+s[res].bind(tx, tvm.thread_axis("cthread"))
+
+# Let's look at the current TVM schedule after blocking and virtual threading
+print(tvm.lower(s, [data, kernel, res], simple_mode=True))
+
+######################################################################
+# Lowering Copies to DMA Transfers
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Next we set the buffer scopes to the corresponding on-chip VTA SRAM buffers.
+# We move the load loops into the 2D convolution computation loop to stage
+# memory loads such that they fit in the on-chip SRAM buffers.
+# Finally we annotate the load/store loop outer axes with the DMA copy pragma
+# to perform bulk memory transfers on VTA.
+
+# Set scope of SRAM buffers
+s[data_buf].set_scope(env.inp_scope)
+s[kernel_buf].set_scope(env.wgt_scope)
+s[res_conv].set_scope(env.acc_scope)
+s[res_shr].set_scope(env.acc_scope)
+s[res_min].set_scope(env.acc_scope)
+s[res_max].set_scope(env.acc_scope)
+
+# Block data and kernel cache reads
+s[data_buf].compute_at(s[res_conv], ic_out)
+s[kernel_buf].compute_at(s[res_conv], ic_out)
+
+# Use DMA copy pragma on DRAM->SRAM operations
+s[data_buf].pragma(s[data_buf].op.axis[0], env.dma_copy)
+s[kernel_buf].pragma(s[kernel_buf].op.axis[0], env.dma_copy)
+
+# Use DMA copy pragma on SRAM->DRAM operation in each result block
+# (this implies that these copies should be performed along b_inn,
+# or result axis 4)
+s[res].pragma(s[res].op.axis[4], env.dma_copy)
+
+######################################################################
+# Lowering Computation to VTA Compute Intrinsics
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# The last phase is to lower the computation loops down to VTA hardware
+# intrinsics by mapping the 2D convolution to tensor intrinsics,
+# and mapping the shift and clipping computations to the vector ALU.
+
+# Apply tensorization over the batch tensor tile axis
+s[res_conv].tensorize(b_tns, env.gemm)
+
+# Add an ALU pragma over the shift and clipping operations
+s[res_shr].pragma(s[res_shr].op.axis[0], env.alu)
+s[res_min].pragma(s[res_min].op.axis[0], env.alu)
+s[res_max].pragma(s[res_max].op.axis[0], env.alu)
+
+# Let's look at the final lowered TVM schedule after lowering memory
+# loads/stores down to DMA copy intrinsics, and the computation down to
+# VTA compute intrinsics.
+print(vta.lower(s, [data, kernel, res], simple_mode=True))
+
+######################################################################
+# TVM Compilation and Verification
+# --------------------------------
+# After specifying the schedule, we can compile it into a TVM function.
+# We save the module so we can send it over RPC.
+# We run the function and verify it against a numpy implementation to
+# ensure correctness.
+
+# This library facilitates 2D convolution testing
+from topi.testing import conv2d_nchw_python
+
+# Compile the TVM module
+my_conv = vta.build(s, [data, kernel, res], "ext_dev", env.target_host, name="my_conv")
+temp = util.tempdir()
+my_conv.save(temp.relpath("conv2d.o"))
+remote.upload(temp.relpath("conv2d.o"))
+f = remote.load_module("conv2d.o")
+
+# Get the remote device context
+ctx = remote.ext_dev(0)
+
+# Initialize the data and kernel arrays randomly in the int range
+# of [-128, 128) in NCHW layout
+data_np = np.random.randint(
+    -128, 128,
+    size=(batch_size, in_channels, height, width)).astype(data.dtype)
+kernel_np = np.random.randint(
+    -128, 128,
+    size=(out_channels, in_channels, kernel_h, kernel_w)).astype(kernel.dtype)
+
+# Apply packing to the data and kernel arrays from a 4D NCHW layout
+# to a 6D NCHWnc packed layout
+data_packed = data_np.reshape(batch_size // env.BATCH,
+                              env.BATCH,
+                              in_channels // env.BLOCK_IN,
+                              env.BLOCK_IN,
+                              height,
+                              width).transpose((0, 2, 4, 5, 1, 3))
+
+kernel_packed = kernel_np.reshape(out_channels // env.BLOCK_OUT,
+                                  env.BLOCK_OUT,
+                                  in_channels // env.BLOCK_IN,
+                                  env.BLOCK_IN,
+                                  kernel_h,
+                                  kernel_w).transpose((0, 2, 4, 5, 1, 3))
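+
+# Quick sanity check (a sketch): packing is just a reshape plus a transpose,
+# so reversing the transpose and flattening the blocked axes recovers the
+# original NCHW array (the kernel packing can be checked the same way).
+assert np.array_equal(
+    data_np,
+    data_packed.transpose((0, 4, 1, 5, 2, 3)).reshape(
+        batch_size, in_channels, height, width))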
+
+# Format the input/output arrays with tvm.nd.array to the DLPack standard
+data_nd = tvm.nd.array(data_packed, ctx)
+kernel_nd = tvm.nd.array(kernel_packed, ctx)
+res_nd = tvm.nd.array(np.zeros(output_shape).astype(res.dtype), ctx)
+
+# Invoke the module to perform the computation
+f(data_nd, kernel_nd, res_nd)
+
+# Verify against numpy implementation
+res_ref = conv2d_nchw_python(data_np.astype(env.acc_dtype),
+                             kernel_np.astype(env.acc_dtype),
+                             (stride_h, stride_w),
+                             (pad_h, pad_w)).astype(env.acc_dtype)
+res_ref = res_ref >> env.INP_WIDTH
+res_ref = np.clip(res_ref, 0, inp_max)
+res_ref = res_ref.astype(res.dtype)
+res_ref = res_ref.reshape((batch_size // env.BATCH,
+                           env.BATCH,
+                           out_channels // env.BLOCK_OUT,
+                           env.BLOCK_OUT,
+                           fout_height,
+                           fout_width)).transpose((0, 2, 4, 5, 1, 3))
+np.testing.assert_allclose(res_ref, res_nd.asnumpy())
+print("Successful 2D convolution test!")
+
+######################################################################
+# Summary
+# -------
+# This tutorial demonstrates how TVM scheduling primitives can be used to
+# lower 2D convolution onto hardware accelerator intrinsics, making
+# use of hardware-specific optimizations, such as latency hiding with
+# virtual threading.
+#
+
+