From 80f9ca00b2643000c619288f3d7a855083571195 Mon Sep 17 00:00:00 2001
From: Zhixun Tan <phisiart@gmail.com>
Date: Thu, 25 Jan 2018 13:35:24 -0500
Subject: [PATCH] [OpenGL] Let OpenGL texture always be 1024 x nrows. (#817)

* OpenGL texture is always 1024 x nrows.

* Address review comments.
---
 src/codegen/codegen_opengl.cc           | 11 +--
 src/runtime/opengl/opengl_device_api.cc | 91 +++++++++++++++++++++----
 src/runtime/opengl/opengl_module.h      | 13 ++++
 tests/webgl/test_local_gemm.py          |  2 +-
 4 files changed, 98 insertions(+), 19 deletions(-)

diff --git a/src/codegen/codegen_opengl.cc b/src/codegen/codegen_opengl.cc
index e645e7f6c..496b15b34 100644
--- a/src/codegen/codegen_opengl.cc
+++ b/src/codegen/codegen_opengl.cc
@@ -154,7 +154,8 @@ void CodeGenOpenGL::BindThreadIndex(const IterVar& iv) {
 
   // Declare threadIdx local variable.
   this->PrintIndent();
-  this->stream << "ivec2 threadIdx = ivec2(gl_FragCoord.xy);\n";
+  this->stream << "ivec2 threadIdx = ivec2(" << runtime::kTextureRowSize
+               << " * int(gl_FragCoord.y) + int(gl_FragCoord.x), 0);\n";
 
   // Return directly if threadIdx.x >= thread_extent.
   this->PrintIndent();
@@ -192,12 +193,14 @@ void CodeGenOpenGL::VisitStmt_(const Store* op) {
   }
 }
 
-// texelFetch(tex, ivec2(idx, 0), 0).r
+// texelFetch(tex, ivec2(idx & kTextureRowMask, idx >> kTextureRowBits), 0).r
 std::string CodeGenOpenGL::TexelFetch(const Variable* buffer, Expr index) {
   std::ostringstream os;
-  os << "texelFetch(" << GetVarID(buffer) << ", ivec2(";
+  os << "texelFetch(" << GetVarID(buffer) << ", ivec2(int(";
   PrintExpr(index, os);
-  os << ", 0), 0).r";
+  os << ") & " << runtime::kTextureRowMask << ", int(";
+  PrintExpr(index, os);
+  os << ") >> " << runtime::kTextureRowBits << "), 0).r";
   return os.str();
 }
 
diff --git a/src/runtime/opengl/opengl_device_api.cc b/src/runtime/opengl/opengl_device_api.cc
index 798003af9..d90d12034 100644
--- a/src/runtime/opengl/opengl_device_api.cc
+++ b/src/runtime/opengl/opengl_device_api.cc
@@ -3,6 +3,7 @@
  * \file opengl_device_api.cc
  */
 #include "./opengl_common.h"
+#include "./opengl_module.h"
 
 #if TVM_OPENGL_RUNTIME
 
@@ -347,8 +348,9 @@ Texture OpenGLWorkspace::CreateTexture(TVMType type, size_t nbytes) {
 
   // Use glTexImage2D with nullptr data to specify GPU data storage.
   auto texture_format = GetTextureFormat(type);
-  auto width = static_cast<GLsizei>(nbytes / (type.bits / 8));
-  auto height = GLsizei(1);
+  auto nelems = static_cast<GLsizei>(nbytes / (type.bits / 8));
+  auto height = (nelems + kTextureRowSize - 1) / kTextureRowSize;
+  auto width = (height == 1) ? nelems : kTextureRowSize;
   OPENGL_CALL(gl->TexImage2D(GL_TEXTURE_2D, /*level=*/0,
                              texture_format.internal_format,
                              width, height, /*border=*/0,
@@ -402,6 +404,51 @@ Program OpenGLWorkspace::CreateProgram(GLuint fragment_shader) {
   return Program(this, program);
 }
 
+/*!
+ * \brief Visit a 1D range of an OpenGL texture-backed TVM array.
+ * When getting/setting a sub image of a texture, we can only specify a 2D
+ * block (xbeg, ybeg, width, height).
+ * Since we are storing all TVM arrays using (kTextureRowSize x nrows) 2D
+ * textures (row-major), a range in an array does not necessarily map to a 2D
+ * block.
+ * This function split a 1D range into 3 2D blocks.
+ * \param beg The index of the first element in the 1D range.
+ * \param end The index of the last + 1 element in the 1D range.
+ * \param on_2d_block Callback for each 2D block. Must have interface
+ * void(GLint xbeg, GLint ybeg, GLsizei width, GLsizei height).
+ */
+template <typename F>
+static void Visit1DRange(GLint beg, GLint end, F&& on_2d_block) {
+  CHECK_LE(beg, end) << "Invalid range.";
+
+  //           xbeg         kTextureRowSize
+  // ybeg  ....************
+  //       ****************
+  //       ****************
+  // ylast *********.......
+  //           xlast
+  GLint xbeg = beg % kTextureRowSize;
+  GLint ybeg = beg / kTextureRowSize;
+  GLint xlast = (end - 1) % kTextureRowSize;
+  GLint ylast = (end - 1) / kTextureRowSize;
+
+  if (ybeg == ylast) {  // Only one row.
+    on_2d_block(xbeg, ybeg, end - beg, 1);
+    return;
+  }
+
+  // First row.
+  on_2d_block(xbeg, ybeg, kTextureRowSize - xbeg, 1);
+
+  // Middle block.
+  if (ylast - ybeg > 1) {
+    on_2d_block(0, ybeg + 1, kTextureRowSize, ylast - ybeg - 1);
+  }
+
+  // Last row.
+  on_2d_block(0, ylast, xlast + 1, 1);
+}
+
 void OpenGLWorkspace::PutTextureData(Texture *texture,
                                      GLint begin,
                                      GLsizei nelems,
@@ -409,12 +456,17 @@ void OpenGLWorkspace::PutTextureData(Texture *texture,
   // Bind to temporary unit.
   BindTextureUnit(NumTextureUnits() - 1, texture->texture());
 
-  // Similar to cudaMemcpy.
-  OPENGL_CALL(gl->TexSubImage2D(GL_TEXTURE_2D, /*level=*/0,
-                                /*xoffset=*/begin, /*yoffset=*/0,
-                                /*width=*/nelems, /*height=*/1,
-                                texture->format_.format, texture->format_.type,
-                                data));
+  Visit1DRange(begin, begin + nelems, [&](GLint xbeg, GLint ybeg,
+                                          GLsizei width, GLsizei height) {
+    auto offset = (ybeg * kTextureRowSize + xbeg - begin) * texture->elemsz();
+    const GLvoid* ptr = static_cast<const char*>(data) + offset;
+
+    // Similar to cudaMemcpy.
+    OPENGL_CALL(gl->TexSubImage2D(GL_TEXTURE_2D, /*level=*/0,
+                                  xbeg, ybeg, width, height,
+                                  texture->format_.format,
+                                  texture->format_.type, ptr));
+  });
 }
 
 void OpenGLWorkspace::GetTextureData(const Texture *texture,
@@ -453,18 +505,29 @@ void OpenGLWorkspace::GetTextureData(const Texture *texture,
   auto nchannels = 4;
   auto padded_data_size = nchannels * nelems * elemsz;
   auto padded_data = std::unique_ptr<char[]>(new char[padded_data_size]);
-  OPENGL_CALL(gl->ReadPixels(/*x=*/begin, /*y=*/0, /*width=*/nelems,
-                             /*height=*/1, GL_RGBA, GL_FLOAT,
-                             padded_data.get()));
+  Visit1DRange(begin, begin + nelems, [&](GLint xbeg, GLint ybeg,
+                                          GLsizei width, GLsizei height) {
+    auto data_offset = (ybeg * kTextureRowSize + xbeg - begin) * elemsz;
+    auto padded_data_offset = data_offset * nchannels;
+    OPENGL_CALL(gl->ReadPixels(xbeg, ybeg, width, height,
+                               GL_RGBA, GL_FLOAT,
+                               padded_data.get() + padded_data_offset));
+  });
   for (GLsizei i = 0; i != nelems; ++i) {
     auto dst = reinterpret_cast<char *>(data) + i * elemsz;
     auto src = padded_data.get() + nchannels * i * elemsz;
     std::memcpy(dst, src, elemsz);
   }
 #else
-  OPENGL_CALL(gl->ReadPixels(/*x=*/begin, /*y=*/0, /*width=*/nelems,
-                             /*height=*/1, texture->format_.format,
-                             texture->format_.type, data));
+  Visit1DRange(begin, begin + nelems, [&](GLint xbeg, GLint ybeg,
+                                          GLsizei width, GLsizei height) {
+    auto offset = (ybeg * kTextureRowSize + xbeg - begin) * texture->elemsz();
+    GLvoid* ptr = static_cast<char*>(data) + offset;
+
+    OPENGL_CALL(gl->ReadPixels(xbeg, ybeg, width, height,
+                               texture->format_.format, texture->format_.type,
+                               ptr));
+  });
 #endif
 
   OPENGL_CALL(gl->DeleteFramebuffers(1, &frame_buffer));
diff --git a/src/runtime/opengl/opengl_module.h b/src/runtime/opengl/opengl_module.h
index 1913878c3..a4cfa20bd 100644
--- a/src/runtime/opengl/opengl_module.h
+++ b/src/runtime/opengl/opengl_module.h
@@ -17,6 +17,19 @@
 namespace tvm {
 namespace runtime {
 
+/*!
+ * \brief The fixed row size of all OpenGL textures in TVM.
+ *
+ * OpenGL has texture size limit on each dimension. Suppose we have a limit of
+ * 1024, then we can have a 2D texture of size (2^10 x 2^10) but not (2^20 x 1).
+ * This means we don't want to just use (n x 1) 2D textures for all arrays,
+ * because that would limit our array size to be 1024. Here we use (1024 x m)
+ * 2D textures. Then we can have arrays of size up to 2^20.
+ */
+static constexpr int kTextureRowBits = 10;
+static constexpr int kTextureRowSize = 1 << kTextureRowBits;
+static constexpr int kTextureRowMask = kTextureRowSize - 1;
+
 /*!
  * \brief Determines how we supply arguments.
  */
diff --git a/tests/webgl/test_local_gemm.py b/tests/webgl/test_local_gemm.py
index 18d2d1d8b..0dd1c0fc7 100644
--- a/tests/webgl/test_local_gemm.py
+++ b/tests/webgl/test_local_gemm.py
@@ -7,7 +7,7 @@ def test_local_gemm():
     if not tvm.module.enabled("llvm"):
         return
 
-    nn = 2
+    nn = 1024
     n = tvm.var('n')
     n = tvm.convert(nn)
     m = n
-- 
GitLab