From 8ca12d8708a83ee93d44b253225a93872685f89d Mon Sep 17 00:00:00 2001
From: Haichen Shen <shenhaichen@gmail.com>
Date: Thu, 17 Aug 2017 15:11:38 -0700
Subject: [PATCH] Add tutorial for convolution in CUDA (#343)

---
 tutorials/python/opt_conv_cuda.py | 226 ++++++++++++++++++++++++++++++
 1 file changed, 226 insertions(+)
 create mode 100644 tutorials/python/opt_conv_cuda.py

diff --git a/tutorials/python/opt_conv_cuda.py b/tutorials/python/opt_conv_cuda.py
new file mode 100644
index 000000000..3b04b0722
--- /dev/null
+++ b/tutorials/python/opt_conv_cuda.py
@@ -0,0 +1,226 @@
+"""How to optimize convolution on GPU
+==================================
+**Author**: `Haichen Shen <https://homes.cs.washington.edu/~haichen/>`_
+
+In this tutorial, we will demonstrate how to write a high performance
+convolution implementation in TVM. We use square-sized input tensors and
+filters as an example, and assume the input to the convolution has a large
+batch. In this example, we use a different layout to store the data in order
+to achieve better data locality. The buffer layout is HWCN, which stands for
+height, width, channel, batch.
+
+"""
+
+################################################################
+# Preparation and Algorithm
+# -------------------------
+#
+# We use fixed-size input tensors with 256 channels and 14 x 14 spatial
+# dimensions. The batch size is 256. The convolution uses 512 filters of
+# size 3 x 3, with stride 1 and padding 1. The following code defines the
+# convolution algorithm in TVM.
+#
+
+import numpy as np
+import tvm
+
+# The sizes of inputs and filters
+batch = 256
+in_channel = 256
+out_channel = 512
+in_size = 14
+kernel = 3
+pad = 1
+stride = 1
+
+# Algorithm
+A = tvm.placeholder((in_size, in_size, in_channel, batch), name='A')
+W = tvm.placeholder((kernel, kernel, in_channel, out_channel), name='W')
+out_size = (in_size - kernel + 2 * pad) // stride + 1
+# Pad input on both sides of the spatial dimensions
+Apad = tvm.compute(
+    (in_size + 2 * pad, in_size + 2 * pad, in_channel, batch),
+    lambda yy, xx, cc, nn: tvm.select(
+        tvm.all(yy >= pad, yy - pad < in_size,
+                xx >= pad, xx - pad < in_size),
+        A[yy - pad, xx - pad, cc, nn], tvm.const(0.)),
+    name='Apad')
+# Create reduction variables
+rc = tvm.reduce_axis((0, in_channel), name='rc')
+ry = tvm.reduce_axis((0, kernel), name='ry')
+rx = tvm.reduce_axis((0, kernel), name='rx')
+# Compute the convolution
+B = tvm.compute(
+    (out_size, out_size, out_channel, batch),
+    lambda yy, xx, ff, nn: tvm.sum(
+        Apad[yy * stride + ry, xx * stride + rx, rc, nn] * W[ry, rx, rc, ff],
+        axis=[ry, rx, rc]),
+    name='B')
+
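+##############################################################################
+# For reference, the same convolution can also be written directly in NumPy.
+# The helper below is only a sketch used later to sanity-check the GPU result
+# (conv_hwcn_np is just a local helper, not a TVM API); it is far too slow to
+# serve as an actual implementation at this problem size.
+
+def conv_hwcn_np(a_np, w_np, stride, pad):
+    in_size, _, in_channel, batch = a_np.shape
+    kernel = w_np.shape[0]
+    out_size = (in_size - kernel + 2 * pad) // stride + 1
+    # Zero-pad the spatial dimensions on both sides
+    apad = np.zeros((in_size + 2 * pad, in_size + 2 * pad, in_channel, batch),
+                    dtype=a_np.dtype)
+    apad[pad:pad + in_size, pad:pad + in_size] = a_np
+    b_np = np.zeros((out_size, out_size, w_np.shape[3], batch), dtype=a_np.dtype)
+    for ry in range(kernel):
+        for rx in range(kernel):
+            # Window of the padded input aligned with filter tap (ry, rx)
+            window = apad[ry:ry + out_size * stride:stride,
+                          rx:rx + out_size * stride:stride]
+            # Contract over the input-channel axis
+            b_np += np.einsum('yxcn,cf->yxfn', window, w_np[ry, rx])
+    return b_np
+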
+
+###############################################################################
+# Memory Hierarchy
+# ----------------
+#
+# We first specify the memory hierarchy for buffers. The figure below shows the
+# GPU memory hierarchy. One important difference from the CPU memory hierarchy
+# is that the GPU provides a cache buffer called shared memory, which is
+# managed by the programmer. Maximizing data reuse in shared memory is
+# therefore critical to achieving high performance in GPU kernels.
+#
+# .. image:: https://github.com/dmlc/web-data/raw/master/tvm/tutorial/gpu_memory_hierarchy.png
+#      :align: center
+#      :height: 319px
+#      :width: 271px
+#
+# In this example, we load both Apad and W into buffers AA and WW, which are
+# stored in shared memory. These buffers will later be shared by all threads
+# within the same thread block to compute the convolution. Each thread then
+# loads its own part from the shared buffers into its local registers, AL and
+# WL. BL is a local cache of the output B, which is also stored in the
+# thread-local registers.
+#
+
+# Designate the memory hierarchy
+s = tvm.create_schedule(B.op)
+s[Apad].compute_inline() # compute Apad inline
+AA = s.cache_read(Apad, 'shared', [B])
+WW = s.cache_read(W, "shared", [B])
+AL = s.cache_read(AA, "local", [B])
+WL = s.cache_read(WW, "local", [B])
+BL = s.cache_write(B, "local")
+
+###############################################################################
+# Blocking
+# --------
+#
+# The following code splits the workload into thread blocks and individual
+# threads. We follow the blocking scheme used in matrix multiplication. As
+# shown in the figure below, given a pixel coordinate (y, x), a thread block is
+# responsible for computing a region of block_factor x block_factor (64 x 64)
+# for output channels and batch. Due to the limited shared memory space, we
+# only load step x block_factor (8 x 64) data from Apad and W each time into
+# the buffers in shared memory.
+#
+# .. image:: https://github.com/dmlc/web-data/raw/master/tvm/tutorial/conv_gpu_blocking.png
+#      :align: center
+#      :height: 308px
+#      :width: 317px
+#
+
+# tile consts
+tile = 8
+num_thread = 8
+block_factor = tile * num_thread
+step = 8
+vthread = 2
+
+# Get the GPU thread indices
+block_x = tvm.thread_axis("blockIdx.x")
+block_y = tvm.thread_axis("blockIdx.y")
+block_z = tvm.thread_axis("blockIdx.z")
+thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x")
+thread_y = tvm.thread_axis((0, num_thread), "threadIdx.y")
+thread_xz = tvm.thread_axis((0, vthread), "vthread", name="vx")
+thread_yz = tvm.thread_axis((0, vthread), "vthread", name="vy")
+
+# Split the workloads
+hi, wi, fi, ni = s[B].op.axis
+bz = s[B].fuse(hi, wi)
+by, fi = s[B].split(fi, factor=block_factor)
+bx, ni = s[B].split(ni, factor=block_factor)
+
+# Bind the iteration variables to GPU thread indices
+s[B].bind(bz, block_z)
+s[B].bind(by, block_y)
+s[B].bind(bx, block_x)
+
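+##############################################################################
+# As a quick sanity check of the blocking scheme (plain Python arithmetic, not
+# a TVM query), the kernel will be launched over a grid of
+# (out_size * out_size) x (out_channel // block_factor) x
+# (batch // block_factor) thread blocks.
+
+print("block grid: %d x %d x %d" % (out_size * out_size,
+                                    out_channel // block_factor,
+                                    batch // block_factor))
+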
+###############################################################################
+# Virtual Thread Split
+# --------------------
+#
+# We further split the workload from a thread block to individual threads. To
+# avoid *memory bank conflict*, we use virtual threads to split the area into 4
+# parts, and then tile it into 8 x 8 grids. Therefore, as shown in the figure
+# below, each thread computes 4 strided grids, where each grid has size 4 x 4.
+#
+# .. image:: https://github.com/dmlc/web-data/raw/master/tvm/tutorial/conv_gpu_vthread.png
+#      :align: center
+#      :height: 188px
+#      :width: 268px
+#
+
+tyz, fi = s[B].split(fi, nparts=vthread)  # virtual thread split
+txz, ni = s[B].split(ni, nparts=vthread)  # virtual thread split
+ty, fi = s[B].split(fi, nparts=num_thread)
+tx, ni = s[B].split(ni, nparts=num_thread)
+s[B].reorder(bz, by, bx, tyz, txz, ty, tx, fi, ni)
+
+s[B].bind(tyz, thread_yz)
+s[B].bind(txz, thread_xz)
+s[B].bind(ty, thread_y)
+s[B].bind(tx, thread_x)
+
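+##############################################################################
+# A quick arithmetic check of the split above (again plain Python, only to
+# confirm the figure): each thread ends up with vthread x vthread = 4 strided
+# grids, and each grid covers a 4 x 4 tile of output channels and batch.
+
+print("grids per thread: %d, tile per grid: %d x %d" % (
+    vthread * vthread,
+    block_factor // vthread // num_thread,
+    block_factor // vthread // num_thread))
+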
+###############################################################################
+# Cooperative Fetching
+# --------------------
+#
+# As mentioned before, in each time step we need to transfer step x block_factor
+# data from GPU global memory to shared memory. In order to reduce the memory
+# transfer per thread, the following code lets threads in the same thread block
+# cooperatively fetch the dependent data from global memory.
+#
+
+
+# Schedule BL local write
+s[BL].compute_at(s[B], tx)
+yi, xi, fi, ni = s[BL].op.axis
+ry, rx, rc = s[BL].op.reduce_axis
+rco, rci = s[BL].split(rc, factor=step)
+s[BL].reorder(rco, ry, rx, rci, fi, ni)
+
+# Attach computation to iteration variables
+s[AA].compute_at(s[BL], rx)
+s[WW].compute_at(s[BL], rx)
+s[AL].compute_at(s[BL], rci)
+s[WL].compute_at(s[BL], rci)
+
+# Schedule for A's shared memory load
+yi, xi, ci, ni = s[AA].op.axis
+ty, ci = s[AA].split(ci, nparts=num_thread)
+tx, ni = s[AA].split(ni, nparts=num_thread)
+_, ni = s[AA].split(ni, factor=4)
+s[AA].reorder(ty, tx, yi, xi, ci, ni)
+s[AA].bind(ty, thread_y)
+s[AA].bind(tx, thread_x)
+s[AA].vectorize(ni)  # vectorize memory load
+
+# Schedule for W's shared memory load
+yi, xi, ci, fi = s[WW].op.axis
+ty, ci = s[WW].split(ci, nparts=num_thread)
+tx, fi = s[WW].split(fi, nparts=num_thread)
+_, fi = s[WW].split(fi, factor=4)
+s[WW].reorder(ty, tx, yi, xi, ci, fi)
+s[WW].bind(ty, thread_y)
+s[WW].bind(tx, thread_x)
+s[WW].vectorize(fi)  # vectorize memory load
+
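+##############################################################################
+# The scheduling is now complete. Before generating the kernel it can be
+# useful to print the lowered IR and check that the loop structure reflects
+# the blocking and cooperative fetching described above (this assumes
+# tvm.lower with simple_mode=True, as used in the other scheduling tutorials).
+
+print(tvm.lower(s, [A, W, B], simple_mode=True))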
+
+###############################################################################
+# Generate CUDA Kernel
+# --------------------
+#
+# Finally, we use TVM to generate and compile the CUDA kernel, and evaluate the
+# latency of the convolution.
+#
+
+func = tvm.build(s, [A, W, B], 'cuda')
+ctx = tvm.gpu(0)
+a_np = np.random.uniform(size=(in_size, in_size, in_channel, batch)).astype(A.dtype)
+w_np = np.random.uniform(size=(kernel, kernel, in_channel, out_channel)).astype(W.dtype)
+a = tvm.nd.array(a_np, ctx)
+w = tvm.nd.array(w_np, ctx)
+b = tvm.nd.array(np.zeros((out_size, out_size, out_channel, batch), dtype=B.dtype), ctx)
+func(a, w, b)
+evaluator = func.time_evaluator(func.entry_name, ctx, number=1)
+print('Convolution: %f ms' % (evaluator(a, w, b).mean * 1e3))
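+
+###############################################################################
+# To double-check the result, we compare the GPU output against the NumPy
+# reference defined earlier (this reference pass is slow at the full problem
+# size), and print the generated CUDA source from the imported device module.
+
+b_np = conv_hwcn_np(a_np, w_np, stride, pad)
+np.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3)
+print(func.imported_modules[0].get_source())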
-- 
GitLab