diff --git a/topi/python/topi/cuda/depthwise_conv2d.py b/topi/python/topi/cuda/depthwise_conv2d.py index d0cfde03f3d95996e879971aa397a1f89b79e20c..b92ccba1ff05337b3cb2d373c03debe11c407aff 100644 --- a/topi/python/topi/cuda/depthwise_conv2d.py +++ b/topi/python/topi/cuda/depthwise_conv2d.py @@ -20,7 +20,10 @@ def schedule_depthwise_conv2d_nchw(outs): outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs s = tvm.create_schedule([x.op for x in outs]) def _schedule(PaddedInput, Filter, DepthwiseConv2d): + in_shape = get_const_tuple(PaddedInput.shape) out_shape = get_const_tuple(DepthwiseConv2d.shape) + in_height = in_shape[2] + in_width = in_shape[3] out_height = out_shape[2] out_width = out_shape[3] channel_multiplier = get_const_tuple(Filter.shape)[1] @@ -42,12 +45,14 @@ def schedule_depthwise_conv2d_nchw(outs): num_vthread_x = 1 blocking_h = out_height blocking_w = out_width - if out_height % 32 == 0: + if out_height % 32 == 0 or in_height >= 108: blocking_h = 32 if out_width % 32 == 0: blocking_w = 32 num_thread_x = 16 num_vthread_x = 2 + elif in_width >= 108: + blocking_w = 32 block_y = tvm.thread_axis("blockIdx.y") block_x = tvm.thread_axis("blockIdx.x") thread_y = tvm.thread_axis((0, num_thread_y), "threadIdx.y") diff --git a/topi/python/topi/nn/convolution.py b/topi/python/topi/nn/convolution.py index 4fadb03dc7028ba399c068391977c86b5fb40008..d6a3c4ca6fdf88bd044ba1b7b511637f57ad9083 100644 --- a/topi/python/topi/nn/convolution.py +++ b/topi/python/topi/nn/convolution.py @@ -121,8 +121,8 @@ def depthwise_conv2d_nchw(Input, Filter, stride, padding): stride : tuple of two ints The spatial stride along height and width - padding : str - 'VALID' or 'SAME' + padding : int or str + Padding size, or ['VALID', 'SAME'] Returns ------- @@ -169,8 +169,8 @@ def depthwise_conv2d_nhwc(Input, Filter, stride, padding): Stride : tvm.Tensor 1-D of size 2 - padding : str - 'VALID' or 'SAME' + padding : int or str + Padding size, or ['VALID', 'SAME'] Returns -------