diff --git a/src/codegen/llvm/intrin_rule_rocm.cc b/src/codegen/llvm/intrin_rule_rocm.cc index e64a4fce4f2747b031eb58edd842cfb04d1d0b69..fa27701870f9007afab7320d00a7cb174fa22077 100644 --- a/src/codegen/llvm/intrin_rule_rocm.cc +++ b/src/codegen/llvm/intrin_rule_rocm.cc @@ -29,8 +29,10 @@ namespace llvm { TVM_REGISTER_GLOBAL("tvm.intrin.rule.rocm.exp") .set_body(DispatchExternOCML); +// On AMD GPUs, fma is slower than mac. +// Dispatching fma to the LLVM fmuladd intrinsic instead of the OCML extern lets the backend generate the faster mac instruction. TVM_REGISTER_GLOBAL("tvm.intrin.rule.rocm.fma") -.set_body(DispatchExternOCML); +.set_body(DispatchLLVMPureIntrin<::llvm::Intrinsic::fmuladd, 1>); TVM_REGISTER_GLOBAL("tvm.intrin.rule.rocm.log") .set_body(DispatchExternOCML);