diff --git a/python/tvm/function.py b/python/tvm/function.py
index 72ec0d2680dea2520e955ba031c238d1601b6d53..72929da80188988063e075e4e36b720950e3bce4 100644
--- a/python/tvm/function.py
+++ b/python/tvm/function.py
@@ -117,11 +117,13 @@ def compute(shape, fcompute, name="compute"):
         The created tensor
     shape = (shape,) if isinstance(shape, _expr.Expr) else shape
     ndim = len(shape)
     arg_names = fcompute.__code__.co_varnames
+    if fcompute.__code__.co_argcount == 0 and len(arg_names) == 1:
+        arg_names = ["i%d" % i for i in range(ndim)]
     if ndim != len(arg_names):
-        raise ValueError("fcompute do not match dimension")
+        raise ValueError("fcompute do not match dimension, ndim=%d" % ndim)
     dim_var = [IterVar((0, s), x) for x, s in zip(arg_names, shape)]
     body = fcompute(*[v.var for v in dim_var])
@@ -170,7 +172,7 @@ def Buffer(shape, dtype=None,
         name, ptr, shape, strides, dtype)
-def IterVar(dom, name='iter', thread_tag=''):
+def IterVar(dom=None, name=None, thread_tag=''):
     """Create a iteration variable
@@ -189,14 +191,17 @@ def IterVar(dom, name='iter', thread_tag=''):
     iter_var : IterVar
        The result itervar
-    if isinstance(dom, (list, tuple)):
-        if len(dom) != 2:
-            raise ValueError("need to list of ranges")
-        dom = Range(dom[0], dom[1])
-    if not isinstance(dom, _collections.Range):
-        raise ValueError("dom need to be Range")
+    if dom is not None:
+        if isinstance(dom, (list, tuple)):
+            if len(dom) != 2:
+                raise ValueError("need to list of ranges")
+            dom = Range(dom[0], dom[1])
+        if not isinstance(dom, _collections.Range):
+            raise ValueError("dom need to be Range")
+    if name is None:
+        name = thread_tag if thread_tag else name
+    name = name if name else 'iter'
     return _function_internal._IterVar(dom, name, thread_tag)
diff --git a/python/tvm/schedule.py b/python/tvm/schedule.py
index b46c5866082fe0c0777c10bad2109c257b668aa2..b276c90a14a6e5c5318ddc036ec40d8498bff0e4 100644
--- a/python/tvm/schedule.py
+++ b/python/tvm/schedule.py
@@ -56,8 +56,6 @@ class Stage(NodeBase):
         if outer is not None:
             if outer.thread_tag == '':
                 raise ValueError("split by outer must have special thread_tag")
-            if outer.dom is None:
-                raise ValueError("split by outer must have specified domain")
             inner = _function_internal._StageSplitByOuter(self, parent, outer, factor)
             if factor is None:
diff --git a/src/schedule/bound.cc b/src/schedule/bound.cc
index 6a23f48d8c90fd7247bc115aadeb8c338805f48f..d4ce520c9229c23ee9d533456dba0150d0fd3067 100644
--- a/src/schedule/bound.cc
+++ b/src/schedule/bound.cc
@@ -5,6 +5,7 @@
 #include <tvm/ir.h>
 #include <tvm/ir_visitor.h>
+#include <tvm/ir_pass.h>
 #include <tvm/schedule_pass.h>
 #include "./int_set.h"
 #include "./graph.h"
@@ -14,7 +15,7 @@ namespace schedule {
 // result = ceil((a / b)), both a and b are positive integer
 inline Expr DivCeil(Expr a, Expr b) {
-  return (a + b - 1) / b;
+  return ir::Simplify((a + b - 1) / b);
 // Downward message passing algorithm on stage schedule s,
diff --git a/tests/python/test_codegen_cuda.py b/tests/python/test_codegen_cuda.py
new file mode 100644
index 0000000000000000000000000000000000000000..b93e80e520592e21fda6641cbaa1a06db29b265a
--- /dev/null
+++ b/tests/python/test_codegen_cuda.py
@@ -0,0 +1,46 @@
+import tvm
+import numpy
+def mock_test_add():
+    """Mock design of a CUDA vector-add test; codegen() below is defined but never called (not yet working)."""
+    n = tvm.Var('n')
+    A = tvm.placeholder((n,), name='A')
+    B = tvm.placeholder((n,), name='B')
+    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
+    s = tvm.Schedule(C.op)
+    # A GPU schedule has to be split by gridIdx and threadIdx
+    num_thread = 256
+    grid_x = tvm.IterVar(thread_tag="gridIdx.x")
+    thread_x = tvm.IterVar((0, num_thread), thread_tag="threadIdx.x")
+    _, x = s[C].split(C.op.axis[0], factor=num_thread, outer=grid_x)
+    _, x = s[C].split(x, outer=thread_x)
+    # compile to IR
+    bounds = tvm.schedule.InferBound(s)
+    stmt = tvm.ir_pass.ScheduleOps(s, bounds)
+    Ab = tvm.Buffer(A.shape, A.dtype, name='A')
+    Bb = tvm.Buffer(B.shape, B.dtype, name='B')
+    Cb = tvm.Buffer(C.shape, C.dtype, name='C')
+    def codegen():
+        # generate host/device code
+        host_code, device_code = tvm.codegen.GenCUDA(
+            s,
+            inputs={A: Ab, B: Bb},
+            outputs={C: Cb},
+            args=[A, B, C])
+        # generate a function based on the code
+        f = tvm.cuda.build_function(host_code, device_code)
+        # create arrays (the module imports `numpy` without an alias, so use the full name)
+        a = tvm.nd.array(numpy.ones(10), ctx=tvm.gpu(0))
+        b = tvm.nd.array(numpy.ones(10), ctx=tvm.gpu(0))
+        c = tvm.nd.array(numpy.zeros(10), ctx=tvm.gpu(0))
+        # call the generated code
+        f(a, b, c)
+        # sync the result and compare it against the expected element-wise sum
+        numpy.testing.assert_equal(c.asnumpy(), numpy.ones(10) * 2)
+if __name__ == "__main__":  # run the mock test when this file is executed as a script
+    mock_test_add()