diff --git a/src/codegen/llvm/intrin_rule_rocm.cc b/src/codegen/llvm/intrin_rule_rocm.cc
index e64a4fce4f2747b031eb58edd842cfb04d1d0b69..fa27701870f9007afab7320d00a7cb174fa22077 100644
--- a/src/codegen/llvm/intrin_rule_rocm.cc
+++ b/src/codegen/llvm/intrin_rule_rocm.cc
@@ -29,8 +29,10 @@ namespace llvm {
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.rocm.exp")
 .set_body(DispatchExternOCML);
 
+// On AMD GPUs, fma is slower than mac. Lower fma to the llvm.fmuladd intrinsic instead of
+// dispatching through DispatchExternOCML, so the backend is free to emit the faster mac instruction.
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.rocm.fma")
-.set_body(DispatchExternOCML);
+.set_body(DispatchLLVMPureIntrin<::llvm::Intrinsic::fmuladd, 1>);
 
 TVM_REGISTER_GLOBAL("tvm.intrin.rule.rocm.log")
 .set_body(DispatchExternOCML);