import tvm
import numpy as np

def mock_test_add():
"""Not yet working, mock design"""
n = tvm.Var('n')
A = tvm.placeholder((n,), name='A')
B = tvm.placeholder((n,), name='B')
C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
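    # `lambda *i: A(*i) + B(*i)` describes an elementwise sum: for every index
    # i in A.shape, C[i] = A[i] + B[i]. The schedule below decides how that
    # index space is actually executed on the GPU.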
    s = tvm.Schedule(C.op)
    # A GPU schedule has to split the axis across gridIdx and threadIdx
    num_thread = 256
    grid_x = tvm.IterVar(thread_tag="gridIdx.x")
    thread_x = tvm.IterVar((0, num_thread), thread_tag="threadIdx.x")
    _, x = s[C].split(C.op.axis[0], factor=num_thread, outer=grid_x)
    _, x = s[C].split(x, outer=thread_x)
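    # After the two splits, the single axis of C is laid out roughly as
    # i = gridIdx.x * num_thread + threadIdx.x, so each CUDA thread computes
    # one element (assuming the intended semantics of this mock split API).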
    # compile to IR
    bounds = tvm.schedule.InferBound(s)
    stmt = tvm.ir_pass.ScheduleOps(s, bounds)
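    # InferBound derives the extent of every IterVar from the schedule, and
    # ScheduleOps materializes the schedule as a loop-nest IR statement
    # (`stmt` is left unused here, since this test is still a mock).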
    Ab = tvm.Buffer(A.shape, A.dtype, name='A')
    Bb = tvm.Buffer(B.shape, B.dtype, name='B')
    Cb = tvm.Buffer(C.shape, C.dtype, name='C')
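    # The Buffers describe the flat device memory that each placeholder tensor
    # is bound to during code generation, one per function argument below.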

    def codegen():
        # generate host/device code
        host_code, device_code = tvm.codegen.GenCUDA(
            s,
            inputs={A: Ab, B: Bb},
            outputs={C: Cb},
            args=[A, B, C])
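        # (Assumed split of responsibilities in this mock design: host_code
        # packs the arguments and launches the kernel, while device_code is
        # the CUDA kernel source for the compute rule above.)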
        # generate a function based on the code
        f = tvm.cuda.build_function(host_code, device_code)
        # create arrays (float32 to match the placeholders' default dtype)
        a = tvm.nd.array(np.ones(10, dtype=np.float32), ctx=tvm.gpu(0))
        b = tvm.nd.array(np.ones(10, dtype=np.float32), ctx=tvm.gpu(0))
        c = tvm.nd.array(np.zeros(10, dtype=np.float32), ctx=tvm.gpu(0))
        # call the generated code
        f(a, b, c)
        # copy the result back to host (this synchronizes), then check it
        np.testing.assert_equal(c.asnumpy(), np.ones(10) * 2)
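
    # codegen() is deliberately never called here: per the docstring this is a
    # mock design, so only schedule construction and lowering run today.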
if __name__ == "__main__":
mock_test_add()