From cdd7f37fefb417caba51ee5ad0c4c849b777f744 Mon Sep 17 00:00:00 2001
From: Leyuan Wang <laurawly@gmail.com>
Date: Fri, 14 Dec 2018 13:32:15 -0800
Subject: [PATCH] [TOPI] NCHWc: add condition for 4-D input shapes; fix Intel
 Graphics conv2d schedule for inception_v3 workloads (#2265)

---
 nnvm/python/nnvm/top/nn.py                |  9 ++-
 topi/python/topi/intel_graphics/conv2d.py | 83 ++++++++---------------
 2 files changed, 34 insertions(+), 58 deletions(-)

diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py
index 74196c078..a37a5d7e0 100644
--- a/nnvm/python/nnvm/top/nn.py
+++ b/nnvm/python/nnvm/top/nn.py
@@ -1,4 +1,4 @@
-# pylint: disable=invalid-name, unused-argument
+# pylint: disable=invalid-name, unused-argument, missing-docstring, no-else-return
 """Definition of nn ops"""
 from __future__ import absolute_import
 
@@ -170,8 +170,11 @@ def compute_contrib_conv2d_NCHWc(attrs, inputs, _):
     out_layout = attrs.get_string("out_layout")
     out_dtype = attrs.get_string("out_dtype")
     out_dtype = inputs[0].dtype if out_dtype == "same" else out_dtype
-    _, in_channel_chunk, _, _, in_channel_block = get_const_tuple(inputs[0].shape)
-    in_channel = in_channel_chunk * in_channel_block
+    if layout == "NCHW":
+        _, in_channel, _, _ = get_const_tuple(inputs[0].shape)
+    else:
+        _, in_channel_chunk, _, _, in_channel_block = get_const_tuple(inputs[0].shape)
+        in_channel = in_channel_chunk * in_channel_block
     assert dilation == (1, 1), "not support dilate now"
     if groups == 1:
         # pylint: disable=assignment-from-no-return
diff --git a/topi/python/topi/intel_graphics/conv2d.py b/topi/python/topi/intel_graphics/conv2d.py
index f6767b68a..d712e7141 100644
--- a/topi/python/topi/intel_graphics/conv2d.py
+++ b/topi/python/topi/intel_graphics/conv2d.py
@@ -1,4 +1,4 @@
-# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return, too-many-arguments, too-many-locals, too-many-statements, no-member, too-many-branches
+# pylint: disable=invalid-name,unused-variable,unused-argument,no-else-return, too-many-arguments, too-many-locals, too-many-statements, no-member, too-many-branches, too-many-boolean-expressions
 """conv2d schedule on Intel Graphics"""
 
 from __future__ import absolute_import as _abs
@@ -61,7 +61,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos):
     return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs)
 
 @conv2d_NCHWc.register(["intel_graphics"])
-def _decl_conv2d(data, kernel, stride, padding, layout, out_layout, out_dtype='float32'):
+def _decl_conv2d(data, kernel, stride, padding, dilation, layout, out_layout, out_dtype='float32'):
     """Conv2D operator for Intel Graphics backend.
 
     Parameters
@@ -126,8 +126,7 @@ def schedule_conv2d_NCHWc(outs):
             for tensor in op.input_tensors:
                 if tensor.op.input_tensors and tensor.op not in scheduled_ops:
                     traverse(tensor.op)
-        if "4_5" in op.tag or "4_4" in op.tag or "2_7" in op.tag or "2_14" in op.tag \
-           or "1_16" in op.tag:
+        if 'conv2d' in op.tag:
             _schedule_cl_spatialpack_NCHWc(s, op)
 
         scheduled_ops.append(op)
@@ -156,31 +155,30 @@ def _decl_cl_spatialpack_NCHWc(data, kernel, stride, padding, out_dtype='float16
     ry = tvm.reduce_axis((0, kernel_h), name='ry')
     rx = tvm.reduce_axis((0, kernel_w), name='rx')
 
-    block_w = 0
-    block_h = 0
+    block_w = 1
+    block_h = 1
     if stride_h == 2:
         if num_filter + kernel_h == 515:
-            conv_tag = "4_4"
             block_h = 4
             block_w = 4
         else:
-            conv_tag = "4_5"
             block_h = 4
             block_w = 5
     elif kernel_h == 3:
         if num_filter == 512:
-            conv_tag = "2_7"
             block_h = 2
             block_w = 7
         else:
-            conv_tag = "2_14"
             block_h = 2
             block_w = 14
+    elif kernel_h == 7 and padding == 3 and stride == 1:
+        block_h = 3
+        block_w = 4
     else:
-        conv_tag = "1_16"
         block_h = 1
         block_w = 16
 
+    attrs = {'block_h': block_h, 'block_w' : block_w}
     c_h = out_height
     c_w = out_width
 
@@ -202,13 +200,13 @@ def _decl_cl_spatialpack_NCHWc(data, kernel, stride, padding, out_dtype='float16
           tvm.sum(
               temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) *
               kernel[ff, rc, ry, rx, vc].astype(out_dtype),
-              axis=[rc, ry, rx]), tag=conv_tag, name='conv')
+              axis=[rc, ry, rx]), name='conv', attrs=attrs)
 
     output = tvm.compute(
         oshape,
         lambda nn, ff, yy, xx:
         conv[nn][ff//nv][yy][xx][ff%nv],
-        name='output_unpack', tag=conv_tag)
+        name='output_unpack', tag='conv2d')
 
     return output
 
@@ -224,21 +222,10 @@ def _schedule_cl_spatialpack_NCHWc(s, op):
 
     kernel_L = s.cache_read(kernel, "local", [conv_L])
     _, in_channel, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape]
-    if "1_16" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 1
-        OUTPUT_BLOCK_WIDTH = 16
-    elif "2_14" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 2
-        OUTPUT_BLOCK_WIDTH = 14
-    elif "2_7" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 2
-        OUTPUT_BLOCK_WIDTH = 7
-    elif "4_5" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 4
-        OUTPUT_BLOCK_WIDTH = 5
-    elif "4_4" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 4
-        OUTPUT_BLOCK_WIDTH = 4
+
+    attrs = s[conv].op.attrs
+    OUTPUT_BLOCK_HEIGHT = attrs['block_h']
+    OUTPUT_BLOCK_WIDTH = attrs['block_w']
 
     # schedule conv
     z_factor = 1
@@ -308,7 +295,7 @@ def _schedule_cl_spatialpack_NCHWc(s, op):
 
 
 @conv2d.register(["intel_graphics"])
-def decl_conv2d(data, kernel, stride, padding, layout='NCHW', out_dtype='float32'):
+def decl_conv2d(data, kernel, stride, padding, dilation, layout='NCHW', out_dtype='float32'):
     """Conv2D operator for Intel Graphics backend.
 
     Parameters
@@ -368,8 +355,7 @@ def schedule_conv2d_nchw(outs):
             for tensor in op.input_tensors:
                 if tensor.op.input_tensors and tensor.op not in scheduled_ops:
                     traverse(tensor.op)
-        if "4_5" in op.tag or "4_4" in op.tag or "2_7" in op.tag or "2_14" in op.tag \
-           or "1_16" in op.tag:
+        if 'conv2d' in op.tag:
             _schedule_cl_spatialpack(s, op)
 
         scheduled_ops.append(op)
@@ -396,31 +382,30 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float
     ry = tvm.reduce_axis((0, kernel_h), name='ry')
     rx = tvm.reduce_axis((0, kernel_w), name='rx')
 
-    block_w = 0
-    block_h = 0
+    block_w = 1
+    block_h = 1
     if stride_h == 2:
         if num_filter + kernel_h == 515:
-            conv_tag = "4_4"
             block_h = 4
             block_w = 4
         else:
-            conv_tag = "4_5"
             block_h = 4
             block_w = 5
     elif kernel_h == 3:
         if num_filter == 512:
-            conv_tag = "2_7"
             block_h = 2
             block_w = 7
         else:
-            conv_tag = "2_14"
             block_h = 2
             block_w = 14
+    elif kernel_h == 7 and padding == 3 and stride == 1:
+        block_h = 3
+        block_w = 4
     else:
-        conv_tag = "1_16"
         block_h = 1
         block_w = 16
 
+    attrs = {'block_h': block_h, 'block_w' : block_w}
     c_h = out_height
     c_w = out_width
 
@@ -453,13 +438,13 @@ def _decl_cl_spatialpack(data, kernel, stride, padding, layout, out_dtype='float
           tvm.sum(
               temp[nn, rc, yy * stride_h + ry, xx * stride_w + rx].astype(out_dtype) *
               kernel_vec[ff, rc, ry, rx, vc].astype(out_dtype),
-              axis=[rc, ry, rx]), tag=conv_tag, name='conv')
+              axis=[rc, ry, rx]), name='conv', attrs=attrs)
 
     output = tvm.compute(
         oshape,
         lambda nn, ff, yy, xx:
         conv[nn][ff//nv][yy][xx][ff%nv],
-        name='output_unpack', tag=conv_tag)
+        name='output_unpack', tag='conv2d')
 
     return output
 
@@ -477,21 +462,9 @@ def _schedule_cl_spatialpack(s, op):
     kernel_L = s.cache_read(kernel_vec, "local", [conv_L])
     _, in_channel, temp_h, temp_w = [util.get_const_int(x) for x in temp.shape]
 
-    if "1_16" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 1
-        OUTPUT_BLOCK_WIDTH = 16
-    elif "2_14" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 2
-        OUTPUT_BLOCK_WIDTH = 14
-    elif "2_7" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 2
-        OUTPUT_BLOCK_WIDTH = 7
-    elif "4_5" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 4
-        OUTPUT_BLOCK_WIDTH = 5
-    elif "4_4" in s[conv].op.tag:
-        OUTPUT_BLOCK_HEIGHT = 4
-        OUTPUT_BLOCK_WIDTH = 4
+    attrs = s[conv].op.attrs
+    OUTPUT_BLOCK_HEIGHT = attrs['block_h']
+    OUTPUT_BLOCK_WIDTH = attrs['block_w']
 
     # schedule conv
     z_factor = 1
-- 
GitLab