diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h index aa8b4322368a96e824864b0784a4cedd1bd4ad5b..e43cc1f318ec9958b02977018284ce766175ccca 100644 --- a/include/tvm/runtime/device_api.h +++ b/include/tvm/runtime/device_api.h @@ -21,7 +21,9 @@ enum DeviceAttrKind : int { kWarpSize = 2, kMaxSharedMemoryPerBlock = 3, kComputeVersion = 4, - kDeviceName = 5 + kDeviceName = 5, + kMaxClockRate = 6, + kMultiProcessorCount = 7 }; /*! \brief Number of bytes each allocation must align to */ diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py index 3e947cb6366ce3ba14d876dc2d6a2bee8be9367d..3fc020c8781bc13437af14fb981265d0eff87f32 100644 --- a/python/tvm/_ffi/runtime_ctypes.py +++ b/python/tvm/_ffi/runtime_ctypes.py @@ -166,6 +166,18 @@ class TVMContext(ctypes.Structure): return _api_internal._GetDeviceAttr( self.device_type, self.device_id, 5) + @property + def max_clock_rate(self): + """Return the max clock frequency of device.""" + return _api_internal._GetDeviceAttr( + self.device_type, self.device_id, 6) + + @property + def multi_processor_count(self): + """Return the number of compute units of device.""" + return _api_internal._GetDeviceAttr( + self.device_type, self.device_id, 7) + def sync(self): """Synchronize until jobs finished at the context.""" check_call(_LIB.TVMSynchronize(self.device_type, self.device_id, None)) diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc index fe13e466ba801d38674c226cb5f4f906c985a3bc..3f697faab72ca73b8bb9600337983e174eb7cce1 100644 --- a/src/runtime/cuda/cuda_device_api.cc +++ b/src/runtime/cuda/cuda_device_api.cc @@ -62,6 +62,16 @@ class CUDADeviceAPI final : public DeviceAPI { *rv = std::string(props.name); return; } + case kMaxClockRate: { + CUDA_CALL(cudaDeviceGetAttribute( + &value, cudaDevAttrClockRate, ctx.device_id)); + break; + } + case kMultiProcessorCount: { + CUDA_CALL(cudaDeviceGetAttribute( + &value, cudaDevAttrMultiProcessorCount, ctx.device_id)); + break; + } } *rv = value; } diff --git a/src/runtime/metal/metal_device_api.mm b/src/runtime/metal/metal_device_api.mm index 077d2546c151db5ccaa43219176b45be5b8d144f..6d225ea7f888c6d0ca0bfc7e9d79c6435021849b 100644 --- a/src/runtime/metal/metal_device_api.mm +++ b/src/runtime/metal/metal_device_api.mm @@ -42,6 +42,8 @@ void MetalWorkspace::GetAttr( case kMaxSharedMemoryPerBlock: return; case kComputeVersion: return; case kDeviceName: return; + case kMaxClockRate: return; + case kMultiProcessorCount: return; case kExist: break; } } diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 40f34a65283086a5f4e56739c068f4690be3a9ad..da527c76134dc4754daf41ba540acc981f24eaae 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -42,6 +42,11 @@ void OpenCLWorkspace::GetAttr( break; } case kWarpSize: { + /* TODO: the warp size of OpenCL device is not always 1 + e.g. Intel GPU has a sub group concept which contains 8 - 32 work items, + corresponding to the number of SIMD entries the heardware configures. + We need to figure out a way to query this information from the hardware. + */ *rv = 1; break; } @@ -62,6 +67,22 @@ void OpenCLWorkspace::GetAttr( *rv = std::string(value); break; } + case kMaxClockRate: { + cl_uint value; + OPENCL_CALL(clGetDeviceInfo( + devices[index], CL_DEVICE_MAX_CLOCK_FREQUENCY, + sizeof(cl_uint), &value, nullptr)); + *rv = static_cast<int32_t>(value); + break; + } + case kMultiProcessorCount: { + cl_uint value; + OPENCL_CALL(clGetDeviceInfo( + devices[index], CL_DEVICE_MAX_COMPUTE_UNITS, + sizeof(cl_uint), &value, nullptr)); + *rv = static_cast<int32_t>(value); + break; + } case kExist: break; } } diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc index bde7e6b2741875adbae1f5813e020e6a6f3ffaf3..d8831880fd8dad13b862d7c5ef340648e993b605 100644 --- a/src/runtime/opencl/opencl_module.cc +++ b/src/runtime/opencl/opencl_module.cc @@ -176,7 +176,7 @@ class OpenCLModuleNode : public ModuleNode { class OpenCLWrappedFunc { public: - // initialize the CUDA function. + // initialize the OpenCL function. void Init(OpenCLModuleNode* m, std::shared_ptr<ModuleNode> sptr, OpenCLModuleNode::KTRefEntry entry, diff --git a/src/runtime/opengl/opengl_device_api.cc b/src/runtime/opengl/opengl_device_api.cc index 0c6354e231cfa012dccbd510e1c22b5c31c0f03b..e925f863df9d293c1f6014f5182234acf86c10ce 100644 --- a/src/runtime/opengl/opengl_device_api.cc +++ b/src/runtime/opengl/opengl_device_api.cc @@ -98,6 +98,8 @@ void OpenGLWorkspace::GetAttr( break; } case kDeviceName: return; + case kMaxClockRate: return; + case kMultiProcessorCount: return; } } diff --git a/src/runtime/rocm/rocm_device_api.cc b/src/runtime/rocm/rocm_device_api.cc index 256c715e92b8d853bd9f2c1e70101912da674e73..55272561c739d19efb12111a4c6d6f604c0f6dc2 100644 --- a/src/runtime/rocm/rocm_device_api.cc +++ b/src/runtime/rocm/rocm_device_api.cc @@ -52,6 +52,8 @@ class ROCMDeviceAPI final : public DeviceAPI { return; } case kDeviceName: return; + case kMaxClockRate: return; + case kMultiProcessorCount: return; } *rv = value; } diff --git a/src/runtime/vulkan/vulkan_device_api.cc b/src/runtime/vulkan/vulkan_device_api.cc index aaf658bbac4b73b8ed43dfa423965c9c7520df3a..a3e4fc294e607ae0ab267b6d07957589e2bf5126 100644 --- a/src/runtime/vulkan/vulkan_device_api.cc +++ b/src/runtime/vulkan/vulkan_device_api.cc @@ -74,6 +74,8 @@ void VulkanWorkspace::GetAttr( break; } case kDeviceName: return; + case kMaxClockRate: return; + case kMultiProcessorCount: return; case kExist: break; } }