From 8ca12d8708a83ee93d44b253225a93872685f89d Mon Sep 17 00:00:00 2001
From: Haichen Shen <shenhaichen@gmail.com>
Date: Thu, 17 Aug 2017 15:11:38 -0700
Subject: [PATCH] Add tutorial for convolution in CUDA (#343)

---
 tutorials/python/opt_conv_cuda.py | 226 ++++++++++++++++++++++++++++++
 1 file changed, 226 insertions(+)
 create mode 100644 tutorials/python/opt_conv_cuda.py

diff --git a/tutorials/python/opt_conv_cuda.py b/tutorials/python/opt_conv_cuda.py
new file mode 100644
index 000000000..3b04b0722
--- /dev/null
+++ b/tutorials/python/opt_conv_cuda.py
@@ -0,0 +1,226 @@
+"""How to optimize convolution on GPU
+=====================================
+**Author**: `Haichen Shen <https://homes.cs.washington.edu/~haichen/>`_
+
+In this tutorial, we will demonstrate how to write a high-performance
+convolution implementation in TVM. We use square-sized input tensors and
+filters as an example, and assume the input to the convolution has a large
+batch. In this example, we use a different layout to store the data in order
+to achieve better data locality. The buffer layout is HWCN, which stands for
+height, width, channel, batch.
+
+"""
+
+###############################################################################
+# Preparation and Algorithm
+# -------------------------
+#
+# We use a fixed size for the input tensors: 256 channels and 14 x 14
+# dimensions. The batch size is 256. The convolution uses 512 filters of size
+# 3 x 3, with stride 1 and padding 1. The following code defines the
+# convolution algorithm in TVM.
+#
+
+import numpy as np
+import tvm
+
+# The sizes of inputs and filters
+batch = 256
+in_channel = 256
+out_channel = 512
+in_size = 14
+kernel = 3
+pad = 1
+stride = 1
+
+# Algorithm
+A = tvm.placeholder((in_size, in_size, in_channel, batch), name='A')
+W = tvm.placeholder((kernel, kernel, in_channel, out_channel), name='W')
+out_size = (in_size - kernel + 2 * pad) // stride + 1
+# Pad input
+Apad = tvm.compute(
+    (in_size + 2 * pad, in_size + 2 * pad, in_channel, batch),
+    lambda yy, xx, cc, nn: tvm.select(
+        tvm.all(yy >= pad, yy - pad < in_size,
+                xx >= pad, xx - pad < in_size),
+        A[yy - pad, xx - pad, cc, nn], tvm.const(0.)),
+    name='Apad')
+# Create reduction variables
+rc = tvm.reduce_axis((0, in_channel), name='rc')
+ry = tvm.reduce_axis((0, kernel), name='ry')
+rx = tvm.reduce_axis((0, kernel), name='rx')
+# Compute the convolution
+B = tvm.compute(
+    (out_size, out_size, out_channel, batch),
+    lambda yy, xx, ff, nn: tvm.sum(
+        Apad[yy * stride + ry, xx * stride + rx, rc, nn] * W[ry, rx, rc, ff],
+        axis=[ry, rx, rc]),
+    name='B')
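+
+###############################################################################
+# As a quick sanity check (a rough sketch that only uses the sizes defined
+# above, not part of the schedule we build next), we confirm the output
+# resolution and count the floating point operations of this convolution.
+# The operation count is handy later for turning the measured latency into an
+# effective throughput.
+#
+
+# With padding 1, a 3 x 3 kernel and stride 1, the 14 x 14 resolution is kept
+assert out_size == in_size
+# Each output element accumulates kernel * kernel * in_channel multiply-adds
+num_flop = 2 * batch * out_size * out_size * out_channel * kernel * kernel * in_channel
+print('Output shape: (%d, %d, %d, %d), total GFLOP: %.2f' %
+      (out_size, out_size, out_channel, batch, num_flop / 1e9))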
+
+###############################################################################
+# Memory Hierarchy
+# ----------------
+#
+# We first specify the memory hierarchy for buffers. The figure below shows the
+# GPU memory hierarchy. One important difference from the CPU memory hierarchy
+# is that the GPU provides a cache buffer called shared memory, which is
+# managed by the programmer. Maximizing data reuse in the shared memory is
+# therefore critical to achieving high performance in GPU kernels.
+#
+# .. image:: https://github.com/dmlc/web-data/raw/master/tvm/tutorial/gpu_memory_hierarchy.png
+#      :align: center
+#      :height: 319px
+#      :width: 271px
+#
+# In this example, we load both Apad and W into the buffers AA and WW, which
+# are stored in shared memory. These buffers will later be shared by all
+# threads within the same thread block to compute the convolution. Each thread
+# then loads its own part from the shared buffers into its local registers, AL
+# and WL. BL is a local cache of the output B, which is also stored in the
+# thread-local registers.
+#
+
+# Designate the memory hierarchy
+s = tvm.create_schedule(B.op)
+s[Apad].compute_inline()  # compute Apad inline
+AA = s.cache_read(Apad, "shared", [B])
+WW = s.cache_read(W, "shared", [B])
+AL = s.cache_read(AA, "local", [B])
+WL = s.cache_read(WW, "local", [B])
+BL = s.cache_write(B, "local")
+
+###############################################################################
+# Blocking
+# --------
+#
+# The following code splits the workload into thread blocks and individual
+# threads. We follow the same blocking scheme as in matrix multiplication. As
+# shown in the figure below, given a pixel coordinate (y, x), a thread block is
+# responsible for computing a region of block_factor x block_factor (64 x 64)
+# over the output channels and the batch. Due to the limited shared memory
+# space, we only load step x block_factor (8 x 64) data from Apad and W into
+# the shared memory buffers at a time.
+#
+# .. image:: https://github.com/dmlc/web-data/raw/master/tvm/tutorial/conv_gpu_blocking.png
+#      :align: center
+#      :height: 308px
+#      :width: 317px
+#
+
+# tile consts
+tile = 8
+num_thread = 8
+block_factor = tile * num_thread
+step = 8
+vthread = 2
+
+# Get the GPU thread indices
+block_x = tvm.thread_axis("blockIdx.x")
+block_y = tvm.thread_axis("blockIdx.y")
+block_z = tvm.thread_axis("blockIdx.z")
+thread_x = tvm.thread_axis((0, num_thread), "threadIdx.x")
+thread_y = tvm.thread_axis((0, num_thread), "threadIdx.y")
+thread_xz = tvm.thread_axis((0, vthread), "vthread", name="vx")
+thread_yz = tvm.thread_axis((0, vthread), "vthread", name="vy")
+
+# Split the workloads
+hi, wi, fi, ni = s[B].op.axis
+bz = s[B].fuse(hi, wi)
+by, fi = s[B].split(fi, factor=block_factor)
+bx, ni = s[B].split(ni, factor=block_factor)
+
+# Bind the iteration variables to GPU thread indices
+s[B].bind(bz, block_z)
+s[B].bind(by, block_y)
+s[B].bind(bx, block_x)
+
+###############################################################################
+# Virtual Thread Split
+# --------------------
+#
+# We further split the workload from a thread block to individual threads. To
+# avoid *memory bank conflicts*, we use virtual threads to split the area into
+# 4 parts, and then tile it into 8 x 8 grids. Therefore, as shown in the figure
+# below, each thread computes 4 strided grids, where the size of each grid is
+# 4 x 4.
+#
+# .. image:: https://github.com/dmlc/web-data/raw/master/tvm/tutorial/conv_gpu_vthread.png
+#      :align: center
+#      :height: 188px
+#      :width: 268px
+#
+
+tyz, fi = s[B].split(fi, nparts=vthread)  # virtual thread split
+txz, ni = s[B].split(ni, nparts=vthread)  # virtual thread split
+ty, fi = s[B].split(fi, nparts=num_thread)
+tx, ni = s[B].split(ni, nparts=num_thread)
+s[B].reorder(bz, by, bx, tyz, txz, ty, tx, fi, ni)
+
+s[B].bind(tyz, thread_yz)
+s[B].bind(txz, thread_xz)
+s[B].bind(ty, thread_y)
+s[B].bind(tx, thread_x)
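+
+###############################################################################
+# Before moving on, it helps to check how this configuration distributes the
+# work (a small sketch derived only from the constants above): each thread
+# block holds num_thread x num_thread threads, and each thread covers
+# vthread x vthread strided grids whose side length is
+# block_factor / (vthread * num_thread).
+#
+
+num_blocks = (out_size * out_size) * (out_channel // block_factor) * (batch // block_factor)
+threads_per_block = num_thread * num_thread
+grid_size = block_factor // (vthread * num_thread)              # 4
+outputs_per_thread = vthread * vthread * grid_size * grid_size  # 4 grids of 4 x 4
+print('thread blocks: %d, threads per block: %d, outputs per thread: %d' %
+      (num_blocks, threads_per_block, outputs_per_thread))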
+
+###############################################################################
+# Cooperative Fetching
+# --------------------
+#
+# As mentioned before, during each time step we need to transfer step x
+# block_factor data from global memory to shared memory. In order to reduce
+# the memory transfer per thread, the following code lets threads in the same
+# thread block cooperatively fetch the data they depend on from global memory.
+#
+
+# Schedule BL local write
+s[BL].compute_at(s[B], tx)
+yi, xi, fi, ni = s[BL].op.axis
+ry, rx, rc = s[BL].op.reduce_axis
+rco, rci = s[BL].split(rc, factor=step)
+s[BL].reorder(rco, ry, rx, rci, fi, ni)
+
+# Attach computation to iteration variables
+s[AA].compute_at(s[BL], rx)
+s[WW].compute_at(s[BL], rx)
+s[AL].compute_at(s[BL], rci)
+s[WL].compute_at(s[BL], rci)
+
+# Schedule for A's shared memory load
+yi, xi, ci, ni = s[AA].op.axis
+ty, ci = s[AA].split(ci, nparts=num_thread)
+tx, ni = s[AA].split(ni, nparts=num_thread)
+_, ni = s[AA].split(ni, factor=4)
+s[AA].reorder(ty, tx, yi, xi, ci, ni)
+s[AA].bind(ty, thread_y)
+s[AA].bind(tx, thread_x)
+s[AA].vectorize(ni)  # vectorize memory load
+
+# Schedule for W's shared memory load
+yi, xi, ci, fi = s[WW].op.axis
+ty, ci = s[WW].split(ci, nparts=num_thread)
+tx, fi = s[WW].split(fi, nparts=num_thread)
+_, fi = s[WW].split(fi, factor=4)
+s[WW].reorder(ty, tx, yi, xi, ci, fi)
+s[WW].bind(ty, thread_y)
+s[WW].bind(tx, thread_x)
+s[WW].vectorize(fi)  # vectorize memory load
+
+
+###############################################################################
+# Generate CUDA Kernel
+# --------------------
+#
+# Finally, we use TVM to generate and compile the CUDA kernel, and evaluate the
+# latency of the convolution.
+#
+
+func = tvm.build(s, [A, W, B], 'cuda')
+ctx = tvm.gpu(0)
+a_np = np.random.uniform(size=(in_size, in_size, in_channel, batch)).astype(A.dtype)
+w_np = np.random.uniform(size=(kernel, kernel, in_channel, out_channel)).astype(W.dtype)
+a = tvm.nd.array(a_np, ctx)
+w = tvm.nd.array(w_np, ctx)
+b = tvm.nd.array(np.zeros((out_size, out_size, out_channel, batch), dtype=B.dtype), ctx)
+func(a, w, b)
+evaluator = func.time_evaluator(func.entry_name, ctx, number=1)
+print('Convolution: %f ms' % (evaluator(a, w, b).mean * 1e3))
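+
+###############################################################################
+# Verify the Result
+# -----------------
+#
+# The check below is a minimal sketch rather than part of the tuned kernel: it
+# recomputes the same convolution with plain NumPy, looping over the output
+# pixels and contracting the (ry, rx, rc) axes with ``np.tensordot``, and then
+# compares the result against the GPU output. It only uses the arrays a_np,
+# w_np and b defined above, and may take a few seconds to run.
+#
+
+a_pad_np = np.zeros((in_size + 2 * pad, in_size + 2 * pad, in_channel, batch), dtype=a_np.dtype)
+a_pad_np[pad:pad + in_size, pad:pad + in_size, :, :] = a_np
+b_ref = np.zeros((out_size, out_size, out_channel, batch), dtype=B.dtype)
+for yy in range(out_size):
+    for xx in range(out_size):
+        # Window of shape (kernel, kernel, in_channel, batch)
+        window = a_pad_np[yy * stride:yy * stride + kernel,
+                          xx * stride:xx * stride + kernel, :, :]
+        # Contract over (ry, rx, rc) to obtain an (out_channel, batch) tile
+        b_ref[yy, xx, :, :] = np.tensordot(w_np, window, axes=([0, 1, 2], [0, 1, 2]))
+np.testing.assert_allclose(b.asnumpy(), b_ref, rtol=1e-3)
+print('Verification passed')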