diff --git a/src/codegen/codegen_cuda.h b/src/codegen/codegen_cuda.h
index 85da32320b71bce2ac1e7ad5513c04eda52f91c3..dc95a08c904a108206e41e7706f16418a668f075 100644
--- a/src/codegen/codegen_cuda.h
+++ b/src/codegen/codegen_cuda.h
@@ -38,7 +38,7 @@ class CodeGenCUDA final : public CodeGenC {
  private:
   // magic number to add pragma unroll to it.
   // used to generate code that is compact but still unrolls.
-  int max_auto_unroll_{256};
+  int max_auto_unroll_{32};
   // Whether global barrier is needed.
   bool need_global_barrier_{false};
   // Global barrier state