diff --git a/topi/python/topi/cuda/depthwise_conv2d.py b/topi/python/topi/cuda/depthwise_conv2d.py
index d0cfde03f3d95996e879971aa397a1f89b79e20c..b92ccba1ff05337b3cb2d373c03debe11c407aff 100644
--- a/topi/python/topi/cuda/depthwise_conv2d.py
+++ b/topi/python/topi/cuda/depthwise_conv2d.py
@@ -20,7 +20,10 @@ def schedule_depthwise_conv2d_nchw(outs):
     outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs
     s = tvm.create_schedule([x.op for x in outs])
     def _schedule(PaddedInput, Filter, DepthwiseConv2d):
+        in_shape = get_const_tuple(PaddedInput.shape)
         out_shape = get_const_tuple(DepthwiseConv2d.shape)
+        in_height = in_shape[2]
+        in_width = in_shape[3]
         out_height = out_shape[2]
         out_width = out_shape[3]
         channel_multiplier = get_const_tuple(Filter.shape)[1]
@@ -42,12 +45,14 @@ def schedule_depthwise_conv2d_nchw(outs):
         num_vthread_x = 1
         blocking_h = out_height
         blocking_w = out_width
-        if out_height % 32 == 0:
+        if out_height % 32 == 0 or in_height >= 108:
             blocking_h = 32
         if out_width % 32 == 0:
             blocking_w = 32
             num_thread_x = 16
             num_vthread_x = 2
+        elif in_width >= 108:
+            blocking_w = 32
         block_y = tvm.thread_axis("blockIdx.y")
         block_x = tvm.thread_axis("blockIdx.x")
         thread_y = tvm.thread_axis((0, num_thread_y), "threadIdx.y")
diff --git a/topi/python/topi/nn/convolution.py b/topi/python/topi/nn/convolution.py
index 4fadb03dc7028ba399c068391977c86b5fb40008..d6a3c4ca6fdf88bd044ba1b7b511637f57ad9083 100644
--- a/topi/python/topi/nn/convolution.py
+++ b/topi/python/topi/nn/convolution.py
@@ -121,8 +121,8 @@ def depthwise_conv2d_nchw(Input, Filter, stride, padding):
     stride : tuple of two ints
         The spatial stride along height and width
 
-    padding : str
-        'VALID' or 'SAME'
+    padding : int or str
+        Padding size, or one of the strings 'VALID' or 'SAME'
 
     Returns
     -------
@@ -169,8 +169,8 @@ def depthwise_conv2d_nhwc(Input, Filter, stride, padding):
     Stride : tvm.Tensor
         1-D of size 2
 
-    padding : str
-        'VALID' or 'SAME'
+    padding : int or str
+        Padding size, or one of the strings 'VALID' or 'SAME'
 
     Returns
     -------