import tvm
import numpy as np


def mock_test_add():
    """Not yet working, mock design"""
    n = tvm.Var('n')
    A = tvm.placeholder((n,), name='A')
    B = tvm.placeholder((n,), name='B')
    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
    s = tvm.Schedule(C.op)
    # A GPU schedule has to be split over gridIdx and threadIdx
    num_thread = 256
    grid_x = tvm.IterVar(thread_tag="gridIdx.x")
    thread_x = tvm.IterVar((0, num_thread), thread_tag="threadIdx.x")
    _, x = s[C].split(C.op.axis[0], factor=num_thread, outer=grid_x)
    _, x = s[C].split(x, outer=thread_x)
    # compile to IR
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.ir_pass.ScheduleOps(s, bounds)
    # declare buffers bound to the function arguments
    Ab = tvm.Buffer(A.shape, A.dtype, name='A')
    Bb = tvm.Buffer(B.shape, B.dtype, name='B')
    Cb = tvm.Buffer(C.shape, C.dtype, name='C')

    def codegen():
        # generate host/device code
        host_code, device_code = tvm.codegen.GenCUDA(
            s, inputs={A: Ab, B: Bb}, outputs={C: Cb}, args=[A, B, C])
        # generate a function based on the code
        f = tvm.cuda.build_function(host_code, device_code)
        # create arrays on the GPU
        a = tvm.nd.array(np.ones(10), ctx=tvm.gpu(0))
        b = tvm.nd.array(np.ones(10), ctx=tvm.gpu(0))
        c = tvm.nd.array(np.zeros(10), ctx=tvm.gpu(0))
        # call the generated code
        f(a, b, c)
        # copy back to host (implicit sync) and check the result
        np.testing.assert_equal(c.asnumpy(), np.ones(10) * 2)
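
def numpy_reference_add():
    # A runnable NumPy-only sketch of the semantics the mock schedule
    # above targets: the flat axis of C is split by factor=num_thread,
    # the outer loop maps to CUDA blocks (gridIdx.x) and the inner loop
    # to threads (threadIdx.x).  The flattening `i = block * num_thread
    # + thread` and the bounds guard are standard CUDA idioms; this
    # helper name and the n=10 size are illustrative, not part of the
    # mock API above.
    num_thread = 256
    n = 10
    a = np.ones(n)
    b = np.ones(n)
    c = np.zeros(n)
    num_block = (n + num_thread - 1) // num_thread  # ceil division
    for block in range(num_block):        # gridIdx.x
        for thread in range(num_thread):  # threadIdx.x
            i = block * num_thread + thread
            if i < n:                     # guard the partial last block
                c[i] = a[i] + b[i]
    np.testing.assert_equal(c, np.ones(n) * 2)


if __name__ == "__main__":
    mock_test_add()
    # numpy_reference_add() actually runs; the TVM path is still a mock
    numpy_reference_add()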