From 52f045171f8fa49500669120f05b5d52a02d1e13 Mon Sep 17 00:00:00 2001
From: tqchen <tianqi.tchen@gmail.com>
Date: Wed, 30 May 2018 20:41:30 -0700
Subject: [PATCH] [RUNTIME] update to make runtime copy type aware

---
 include/tvm/runtime/device_api.h              |  7 ++-
 nnvm/tests/python/compiler/test_param_dict.py | 44 ++++++++++++++
 src/codegen/verilog/vpi_device_api.cc         |  1 +
 src/runtime/c_runtime_api.cc                  |  6 +-
 src/runtime/cpu_device_api.cc                 |  1 +
 src/runtime/cuda/cuda_device_api.cc           |  1 +
 src/runtime/metal/metal_common.h              |  1 +
 src/runtime/metal/metal_device_api.mm         |  1 +
 src/runtime/opencl/opencl_common.h            |  1 +
 src/runtime/opencl/opencl_device_api.cc       |  1 +
 src/runtime/opengl/opengl_device_api.cc       |  1 +
 src/runtime/rocm/rocm_device_api.cc           |  1 +
 src/runtime/rpc/rpc_device_api.cc             |  8 +--
 src/runtime/rpc/rpc_session.cc                | 58 ++++++++++++++-----
 src/runtime/rpc/rpc_session.h                 | 16 +++--
 src/runtime/vulkan/vulkan_common.h            |  1 +
 src/runtime/vulkan/vulkan_device_api.cc       |  1 +
 tests/python/unittest/test_runtime_rpc.py     | 33 +++++++++++
 18 files changed, 155 insertions(+), 28 deletions(-)

diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h
index 334d3e61b..82128fc58 100644
--- a/include/tvm/runtime/device_api.h
+++ b/include/tvm/runtime/device_api.h
@@ -81,18 +81,21 @@ class DeviceAPI {
    * \param from_offset The byte offeset in the from.
    * \param to The target array.
    * \param to_offset The byte offset in the to.
-   * \param size The size of the memory
+   * \param num_bytes The size of the memory in bytes
    * \param ctx_from The source context
    * \param ctx_to The target context
+   * \param type_hint The type of elements, only neded by certain backends.
+   *                  can be useful for cross device endian converison.
    * \param stream Optional stream object.
    */
   virtual void CopyDataFromTo(const void* from,
                               size_t from_offset,
                               void* to,
                               size_t to_offset,
-                              size_t size,
+                              size_t num_bytes,
                               TVMContext ctx_from,
                               TVMContext ctx_to,
+                              TVMType type_hint,
                               TVMStreamHandle stream) = 0;
     /*!
    * \brief Create a new stream of execution.
diff --git a/nnvm/tests/python/compiler/test_param_dict.py b/nnvm/tests/python/compiler/test_param_dict.py
index 667ae4e48..eb547373e 100644
--- a/nnvm/tests/python/compiler/test_param_dict.py
+++ b/nnvm/tests/python/compiler/test_param_dict.py
@@ -1,5 +1,9 @@
+import os
 import numpy as np
 import nnvm.compiler
+import tvm
+from tvm.contrib import rpc, util, graph_runtime
+
 
 def test_save_load():
     x = np.random.uniform(size=(10, 2)).astype("float32")
@@ -15,5 +19,45 @@ def test_save_load():
     np.testing.assert_equal(param2["y"].asnumpy(), y)
 
 
+def test_bigendian_rpc_param():
+    """Test big endian rpc when there is a PowerPC RPC server available"""
+    host = os.environ.get("TVM_POWERPC_TEST_HOST", None)
+    port = os.environ.get("TVM_POWERPC_TEST_PORT", 9090)
+    if host is None:
+        return
+
+    def verify_nnvm(remote, target, shape, dtype):
+        x = nnvm.sym.Variable("x")
+        y = x + 1
+        graph, lib, _ = nnvm.compiler.build(
+            y, target,
+            shape={"x": shape},
+        dtype={"x": dtype})
+
+        temp = util.tempdir()
+        path_dso = temp.relpath("dev_lib.o")
+        lib.save(path_dso)
+        remote.upload(path_dso)
+        lib = remote.load_module("dev_lib.o")
+        a = np.random.randint(0, 256, size=shape).astype(dtype)
+        a[:] = 1
+        params = {"x" : a}
+        ctx = remote.cpu(0)
+        m = graph_runtime.create(graph, lib, ctx)
+        # uses save param_dict
+        m.load_params(nnvm.compiler.save_param_dict(params))
+        m.run()
+        out = m.get_output(0, tvm.nd.empty(shape, dtype=dtype, ctx=ctx))
+        np.testing.assert_allclose(a + 1, out.asnumpy())
+
+    print("Test RPC connection to PowerPC...")
+    remote = rpc.connect(host, port)
+    target = "llvm -mtriple=powerpc-linux-gnu"
+    for dtype in ["float32", "float64", "int32", "int8"]:
+        verify_nnvm(remote, target, (10,), dtype)
+
+
+
 if __name__ == "__main__":
     test_save_load()
+    test_bigendian_rpc_param()
diff --git a/src/codegen/verilog/vpi_device_api.cc b/src/codegen/verilog/vpi_device_api.cc
index 8efd65785..d53a12962 100644
--- a/src/codegen/verilog/vpi_device_api.cc
+++ b/src/codegen/verilog/vpi_device_api.cc
@@ -93,6 +93,7 @@ class VPIDeviceAPI final : public runtime::DeviceAPI {
                       size_t size,
                       TVMContext ctx_from,
                       TVMContext ctx_to,
+                      TVMType type_hint,
                       TVMStreamHandle stream) final {
     if (static_cast<int>(ctx_from.device_type) == kDLVPI) {
       from = RealAddr(static_cast<const char*>(from) + from_offset, size);
diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc
index 5c4ac57fe..df3a19f30 100644
--- a/src/runtime/c_runtime_api.cc
+++ b/src/runtime/c_runtime_api.cc
@@ -434,7 +434,7 @@ int TVMArrayCopyFromTo(TVMArrayHandle from,
   DeviceAPIManager::Get(ctx)->CopyDataFromTo(
     from->data, static_cast<size_t>(from->byte_offset),
     to->data, static_cast<size_t>(to->byte_offset),
-    from_size, from->ctx, to->ctx, stream);
+    from_size, from->ctx, to->ctx, from->dtype, stream);
 
   API_END();
 }
@@ -452,7 +452,7 @@ int TVMArrayCopyFromBytes(TVMArrayHandle handle,
   DeviceAPIManager::Get(handle->ctx)->CopyDataFromTo(
       data, 0,
       handle->data, static_cast<size_t>(handle->byte_offset),
-      nbytes, cpu_ctx, handle->ctx, nullptr);
+      nbytes, cpu_ctx, handle->ctx, handle->dtype, nullptr);
   API_END();
 }
 
@@ -469,7 +469,7 @@ int TVMArrayCopyToBytes(TVMArrayHandle handle,
   DeviceAPIManager::Get(handle->ctx)->CopyDataFromTo(
       handle->data, static_cast<size_t>(handle->byte_offset),
       data, 0,
-      nbytes, handle->ctx, cpu_ctx, nullptr);
+      nbytes, handle->ctx, cpu_ctx, handle->dtype, nullptr);
   API_END();
 }
 
diff --git a/src/runtime/cpu_device_api.cc b/src/runtime/cpu_device_api.cc
index 8199988f3..e3434e018 100644
--- a/src/runtime/cpu_device_api.cc
+++ b/src/runtime/cpu_device_api.cc
@@ -53,6 +53,7 @@ class CPUDeviceAPI final : public DeviceAPI {
                       size_t size,
                       TVMContext ctx_from,
                       TVMContext ctx_to,
+                      TVMType type_hint,
                       TVMStreamHandle stream) final {
     memcpy(static_cast<char*>(to) + to_offset,
            static_cast<const char*>(from) + from_offset,
diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc
index 3f697faab..9aeca5058 100644
--- a/src/runtime/cuda/cuda_device_api.cc
+++ b/src/runtime/cuda/cuda_device_api.cc
@@ -99,6 +99,7 @@ class CUDADeviceAPI final : public DeviceAPI {
                       size_t size,
                       TVMContext ctx_from,
                       TVMContext ctx_to,
+                      TVMType type_hint,
                       TVMStreamHandle stream) final {
     cudaStream_t cu_stream = static_cast<cudaStream_t>(stream);
     from = static_cast<const char*>(from) + from_offset;
diff --git a/src/runtime/metal/metal_common.h b/src/runtime/metal/metal_common.h
index fa73b8250..2f75407f9 100644
--- a/src/runtime/metal/metal_common.h
+++ b/src/runtime/metal/metal_common.h
@@ -75,6 +75,7 @@ class MetalWorkspace final : public DeviceAPI {
                       size_t size,
                       TVMContext ctx_from,
                       TVMContext ctx_to,
+                      TVMType type_hint,
                       TVMStreamHandle stream) final;
   void StreamSync(TVMContext ctx, TVMStreamHandle stream) final;
   void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final;
diff --git a/src/runtime/metal/metal_device_api.mm b/src/runtime/metal/metal_device_api.mm
index 6d225ea7f..dcde6945a 100644
--- a/src/runtime/metal/metal_device_api.mm
+++ b/src/runtime/metal/metal_device_api.mm
@@ -158,6 +158,7 @@ void MetalWorkspace::CopyDataFromTo(const void* from,
                                     size_t size,
                                     TVMContext ctx_from,
                                     TVMContext ctx_to,
+                                    TVMType type_hint,
                                     TVMStreamHandle stream) {
   this->Init();
   CHECK(stream == nullptr);
diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h
index 67934a078..a97efe408 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -154,6 +154,7 @@ class OpenCLWorkspace final : public DeviceAPI {
                       size_t size,
                       TVMContext ctx_from,
                       TVMContext ctx_to,
+                      TVMType type_hint,
                       TVMStreamHandle stream) final;
   void StreamSync(TVMContext ctx, TVMStreamHandle stream) final;
   void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final;
diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc
index da527c761..07a82d34f 100644
--- a/src/runtime/opencl/opencl_device_api.cc
+++ b/src/runtime/opencl/opencl_device_api.cc
@@ -110,6 +110,7 @@ void OpenCLWorkspace::CopyDataFromTo(const void* from,
                                      size_t size,
                                      TVMContext ctx_from,
                                      TVMContext ctx_to,
+                                     TVMType type_hint,
                                      TVMStreamHandle stream) {
   this->Init();
   CHECK(stream == nullptr);
diff --git a/src/runtime/opengl/opengl_device_api.cc b/src/runtime/opengl/opengl_device_api.cc
index e925f863d..b45ee5079 100644
--- a/src/runtime/opengl/opengl_device_api.cc
+++ b/src/runtime/opengl/opengl_device_api.cc
@@ -119,6 +119,7 @@ void OpenGLWorkspace::CopyDataFromTo(const void* from,
                                      size_t size,
                                      TVMContext ctx_from,
                                      TVMContext ctx_to,
+                                     TVMType type_hint,
                                      TVMStreamHandle stream) {
   CHECK(stream == nullptr);
 
diff --git a/src/runtime/rocm/rocm_device_api.cc b/src/runtime/rocm/rocm_device_api.cc
index 55272561c..47b9e8da7 100644
--- a/src/runtime/rocm/rocm_device_api.cc
+++ b/src/runtime/rocm/rocm_device_api.cc
@@ -81,6 +81,7 @@ class ROCMDeviceAPI final : public DeviceAPI {
                       size_t size,
                       TVMContext ctx_from,
                       TVMContext ctx_to,
+                      TVMType type_hint,
                       TVMStreamHandle stream) final {
     hipStream_t hip_stream = static_cast<hipStream_t>(stream);
     from = static_cast<const char*>(from) + from_offset;
diff --git a/src/runtime/rpc/rpc_device_api.cc b/src/runtime/rpc/rpc_device_api.cc
index 1eff8e746..5740a393c 100644
--- a/src/runtime/rpc/rpc_device_api.cc
+++ b/src/runtime/rpc/rpc_device_api.cc
@@ -49,6 +49,7 @@ class RPCDeviceAPI final : public DeviceAPI {
                       size_t size,
                       TVMContext ctx_from,
                       TVMContext ctx_to,
+                      TVMType type_hint,
                       TVMStreamHandle stream) final {
     int from_dev_type = ctx_from.device_type;
     int to_dev_type = ctx_to.device_type;
@@ -60,19 +61,18 @@ class RPCDeviceAPI final : public DeviceAPI {
           RPCCode::kCopyAmongRemote,
           static_cast<const RemoteSpace*>(from)->data, from_offset,
           static_cast<const RemoteSpace*>(to)->data, to_offset,
-          size,  ctx_from, ctx_to, stream);
+          size,  ctx_from, ctx_to, type_hint, stream);
     } else if (from_dev_type > kRPCSessMask &&
                to_dev_type == kDLCPU) {
       GetSess(ctx_from)->CopyFromRemote(
           static_cast<const RemoteSpace*>(from)->data, from_offset,
-          to, to_offset, size,
-          ctx_from);
+          to, to_offset, size, ctx_from, type_hint);
     } else if (from_dev_type == kDLCPU &&
                to_dev_type > kRPCSessMask) {
       GetSess(ctx_to)->CopyToRemote(
           (void*)from, from_offset,  // NOLINT(*)
           static_cast<const RemoteSpace*>(to)->data, to_offset,
-          size, ctx_to);
+          size, ctx_to, type_hint);
     } else {
       LOG(FATAL) << "expect copy from/to remote or between remote";
     }
diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc
index d5eee1697..4d38fdc01 100644
--- a/src/runtime/rpc/rpc_session.cc
+++ b/src/runtime/rpc/rpc_session.cc
@@ -303,6 +303,7 @@ class RPCSession::EventHandler : public dmlc::Stream {
   std::string temp_data_;
   // Temp variables for copy request state.
   TVMContext copy_ctx_;
+  TVMType copy_dtype_;
   uint64_t copy_handle_, copy_offset_, copy_size_;
   // State switcher
   void SwitchToState(State state) {
@@ -344,11 +345,13 @@ class RPCSession::EventHandler : public dmlc::Stream {
       case kDoCopyFromRemote: {
         this->RequestBytes(sizeof(uint64_t) * 3);
         this->RequestBytes(sizeof(TVMContext));
+        this->RequestBytes(sizeof(TVMType));
         break;
       }
       case kDoCopyToRemote: {
         this->RequestBytes(sizeof(uint64_t) * 3);
         this->RequestBytes(sizeof(TVMContext));
+        this->RequestBytes(sizeof(TVMType));
         break;
       }
       case kCopyAckReceived:
@@ -554,18 +557,30 @@ class RPCSession::EventHandler : public dmlc::Stream {
   }
 
   void HandleCopyFromRemote() {
-    uint64_t handle, offset, size;
+    uint64_t handle, offset, num_bytes;
     TVMContext ctx;
+    TVMType type_hint;
     this->Read(&handle);
     this->Read(&offset);
-    this->Read(&size);
+    this->Read(&num_bytes);
     this->Read(&ctx);
+    this->Read(&type_hint);
+    size_t elem_bytes = (type_hint.bits * type_hint.lanes + 7) / 8;
+
     if (ctx.device_type == kDLCPU) {
       RPCCode code = RPCCode::kCopyAck;
       this->Write(code);
-      this->WriteArray(reinterpret_cast<char*>(handle) + offset, size);
+      char* dptr = reinterpret_cast<char*>(handle) + offset;
+      if (!DMLC_IO_NO_ENDIAN_SWAP) {
+        temp_data_.resize(0);
+        temp_data_.insert(temp_data_.end(), dptr, dptr + num_bytes);
+        dmlc::ByteSwap(dmlc::BeginPtr(temp_data_), elem_bytes, num_bytes / elem_bytes);
+        this->WriteArray(temp_data_.data(), num_bytes);
+      } else {
+        this->WriteArray(dptr, num_bytes);
+      }
     } else {
-      temp_data_.resize(size + 1);
+      temp_data_.resize(num_bytes + 1);
       try {
         TVMContext cpu_ctx;
         cpu_ctx.device_type = kDLCPU;
@@ -573,10 +588,13 @@ class RPCSession::EventHandler : public dmlc::Stream {
         DeviceAPI::Get(ctx)->CopyDataFromTo(
             reinterpret_cast<void*>(handle), offset,
             dmlc::BeginPtr(temp_data_), 0,
-            size, ctx, cpu_ctx, nullptr);
+            num_bytes, ctx, cpu_ctx, type_hint, nullptr);
         RPCCode code = RPCCode::kCopyAck;
         this->Write(code);
-        this->WriteArray(&temp_data_[0], size);
+        if (!DMLC_IO_NO_ENDIAN_SWAP) {
+          dmlc::ByteSwap(dmlc::BeginPtr(temp_data_), elem_bytes, num_bytes / elem_bytes);
+        }
+        this->WriteArray(&temp_data_[0], num_bytes);
       } catch (const std::runtime_error &e) {
         RPCCode code = RPCCode::kException;
         this->Write(code);
@@ -597,6 +615,7 @@ class RPCSession::EventHandler : public dmlc::Stream {
       CHECK(this->Read(&copy_offset_));
       CHECK(this->Read(&copy_size_));
       CHECK(this->Read(&copy_ctx_));
+      CHECK(this->Read(&copy_dtype_));
       arg_recv_stage_ = 1;
       CHECK_EQ(pending_request_bytes_, 0U);
       this->RequestBytes(copy_size_);
@@ -607,12 +626,20 @@ class RPCSession::EventHandler : public dmlc::Stream {
       int ret_tcode = kNull;
       RPCCode code = RPCCode::kReturn;
       std::string errmsg;
+
+      size_t elem_bytes = (copy_dtype_.bits * copy_dtype_.lanes + 7) / 8;
       if (copy_ctx_.device_type == kDLCPU) {
-        this->ReadArray(
-            reinterpret_cast<char*>(copy_handle_) + copy_offset_, copy_size_);
+        char* dptr = reinterpret_cast<char*>(copy_handle_) + copy_offset_;
+        this->ReadArray(dptr, copy_size_);
+        if (!DMLC_IO_NO_ENDIAN_SWAP) {
+          dmlc::ByteSwap(dptr, elem_bytes, copy_size_ / elem_bytes);
+        }
       } else {
         temp_data_.resize(copy_size_ + 1);
         this->ReadArray(&temp_data_[0], copy_size_);
+        if (!DMLC_IO_NO_ENDIAN_SWAP) {
+          dmlc::ByteSwap(dmlc::BeginPtr(temp_data_), elem_bytes, copy_size_ / elem_bytes);
+        }
         try {
           TVMContext cpu_ctx;
           cpu_ctx.device_type = kDLCPU;
@@ -620,7 +647,7 @@ class RPCSession::EventHandler : public dmlc::Stream {
           DeviceAPI::Get(copy_ctx_)->CopyDataFromTo(
               temp_data_.data(), 0,
               reinterpret_cast<void*>(copy_handle_), copy_offset_,
-              copy_size_, cpu_ctx, copy_ctx_, nullptr);
+              copy_size_, cpu_ctx, copy_ctx_, copy_dtype_, nullptr);
         } catch (const std::runtime_error &e) {
           code = RPCCode::kException;
           errmsg = e.what();
@@ -873,7 +900,8 @@ void RPCSession::CopyToRemote(void* from,
                               void* to,
                               size_t to_offset,
                               size_t data_size,
-                              TVMContext ctx_to) {
+                              TVMContext ctx_to,
+                              TVMType type_hint) {
   std::lock_guard<std::recursive_mutex> lock(mutex_);
   ctx_to = handler_->StripSessMask(ctx_to);
   RPCCode code = RPCCode::kCopyToRemote;
@@ -885,6 +913,7 @@ void RPCSession::CopyToRemote(void* from,
   uint64_t size = static_cast<uint64_t>(data_size);
   handler_->Write(size);
   handler_->Write(ctx_to);
+  handler_->Write(type_hint);
   handler_->WriteArray(reinterpret_cast<char*>(from) + from_offset, data_size);
   TVMRetValue rv;
   CHECK(HandleUntilReturnEvent(&rv, true, nullptr) == RPCCode::kReturn);
@@ -895,7 +924,8 @@ void RPCSession::CopyFromRemote(void* from,
                                 void* to,
                                 size_t to_offset,
                                 size_t data_size,
-                                TVMContext ctx_from) {
+                                TVMContext ctx_from,
+                                TVMType type_hint) {
   std::lock_guard<std::recursive_mutex> lock(mutex_);
   ctx_from = handler_->StripSessMask(ctx_from);
   RPCCode code = RPCCode::kCopyFromRemote;
@@ -907,6 +937,7 @@ void RPCSession::CopyFromRemote(void* from,
   uint64_t size = static_cast<uint64_t>(data_size);
   handler_->Write(size);
   handler_->Write(ctx_from);
+  handler_->Write(type_hint);
   TVMRetValue rv;
   CHECK(HandleUntilReturnEvent(&rv, true, nullptr) == RPCCode::kCopyAck);
   reader_.Reserve(data_size);
@@ -996,7 +1027,8 @@ void RPCCopyAmongRemote(TVMArgs args, TVMRetValue *rv) {
   uint64_t size = args[4];
   TVMContext ctx_from = args[5];
   TVMContext ctx_to = args[6];
-  TVMStreamHandle stream = args[7];
+  TVMType type_hint = args[7];
+  TVMStreamHandle stream = args[8];
   TVMContext ctx = ctx_from;
   if (ctx.device_type == kDLCPU) {
     ctx = ctx_to;
@@ -1008,7 +1040,7 @@ void RPCCopyAmongRemote(TVMArgs args, TVMRetValue *rv) {
   DeviceAPI::Get(ctx)->CopyDataFromTo(
       from, from_offset,
       to, to_offset,
-      size, ctx_from, ctx_to, stream);
+      size, ctx_from, ctx_to, type_hint, stream);
 }
 
 void RPCModuleLoad(TVMArgs args, TVMRetValue *rv) {
diff --git a/src/runtime/rpc/rpc_session.h b/src/runtime/rpc/rpc_session.h
index 5fe453924..68f6763ae 100644
--- a/src/runtime/rpc/rpc_session.h
+++ b/src/runtime/rpc/rpc_session.h
@@ -116,30 +116,34 @@ class RPCSession {
    * \param from_offset The byte offeset in the from.
    * \param to The target array.
    * \param to_offset The byte offset in the to.
-   * \param size The size of the memory.
+   * \param nbytes The size of the memory in bytes.
    * \param ctx_to The target context.
+   * \param type_hint Hint of content data type.
    */
   void CopyToRemote(void* from,
                     size_t from_offset,
                     void* to,
                     size_t to_offset,
-                    size_t size,
-                    TVMContext ctx_to);
+                    size_t nbytes,
+                    TVMContext ctx_to,
+                    TVMType type_hint);
   /*!
    * \brief Copy bytes from remote array content.
    * \param from The source host data.
    * \param from_offset The byte offeset in the from.
    * \param to The target array.
    * \param to_offset The byte offset in the to.
-   * \param size The size of the memory.
+   * \param nbytes The size of the memory in bytes.
    * \param ctx_from The source context.
+   * \param type_hint Hint of content data type.
    */
   void CopyFromRemote(void* from,
                       size_t from_offset,
                       void* to,
                       size_t to_offset,
-                      size_t size,
-                      TVMContext ctx_from);
+                      size_t nbytes,
+                      TVMContext ctx_from,
+                      TVMType type_hint);
   /*!
    * \brief Get a remote timer function on ctx.
    *  This function consumes fhandle, caller should not call Free on fhandle.
diff --git a/src/runtime/vulkan/vulkan_common.h b/src/runtime/vulkan/vulkan_common.h
index 49d60925e..0d02c0aaa 100644
--- a/src/runtime/vulkan/vulkan_common.h
+++ b/src/runtime/vulkan/vulkan_common.h
@@ -141,6 +141,7 @@ class VulkanWorkspace final : public DeviceAPI {
                       size_t size,
                       TVMContext ctx_from,
                       TVMContext ctx_to,
+                      TVMType type_hint,
                       TVMStreamHandle stream) final;
   void StreamSync(TVMContext ctx, TVMStreamHandle stream) final;
   void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final;
diff --git a/src/runtime/vulkan/vulkan_device_api.cc b/src/runtime/vulkan/vulkan_device_api.cc
index a3e4fc294..2abb0c277 100644
--- a/src/runtime/vulkan/vulkan_device_api.cc
+++ b/src/runtime/vulkan/vulkan_device_api.cc
@@ -131,6 +131,7 @@ void VulkanWorkspace::CopyDataFromTo(const void* from,
                                      size_t size,
                                      TVMContext ctx_from,
                                      TVMContext ctx_to,
+                                     TVMType type_hint,
                                      TVMStreamHandle stream) {
   this->Init();
   CHECK(stream == nullptr);
diff --git a/tests/python/unittest/test_runtime_rpc.py b/tests/python/unittest/test_runtime_rpc.py
index 581890efe..a609f3139 100644
--- a/tests/python/unittest/test_runtime_rpc.py
+++ b/tests/python/unittest/test_runtime_rpc.py
@@ -1,9 +1,41 @@
 import tvm
+import os
 import logging
 import numpy as np
 import time
 from tvm.contrib import rpc, util
 
+
+def test_bigendian_rpc():
+    """Test big endian rpc when there is a PowerPC RPC server available"""
+    host = os.environ.get("TVM_POWERPC_TEST_HOST", None)
+    port = os.environ.get("TVM_POWERPC_TEST_PORT", 9090)
+    if host is None:
+        return
+    def verify_rpc(remote, target, shape, dtype):
+        A = tvm.placeholder(shape, dtype=dtype)
+        B = tvm.compute(A.shape, lambda i: A[i]+tvm.const(1, A.dtype))
+        s = tvm.create_schedule(B.op)
+        f = tvm.build(s, [A, B], target, name="myadd")
+
+        ctx = remote.cpu(0)
+        a = tvm.nd.array(np.random.randint(0, 256, size=shape).astype(A.dtype), ctx=ctx)
+        b = tvm.nd.array(np.zeros(shape).astype(A.dtype), ctx=ctx)
+        temp = util.tempdir()
+        path_dso = temp.relpath("dev_lib.o")
+        f.save(path_dso)
+        remote.upload(path_dso)
+        f = remote.load_module("dev_lib.o")
+        f(a, b)
+        np.testing.assert_allclose(a.asnumpy() + 1, b.asnumpy())
+
+    print("Test RPC connection to PowerPC...")
+    remote = rpc.connect(host, port)
+    target = "llvm -mtriple=powerpc-linux-gnu"
+    for dtype in ["float32", "float64", "int32", "int8"]:
+        verify_rpc(remote, target, (10,), dtype)
+
+
 def test_rpc_simple():
     if not tvm.module.enabled("rpc"):
         return
@@ -166,6 +198,7 @@ def test_local_func():
 
 if __name__ == "__main__":
     logging.basicConfig(level=logging.INFO)
+    test_bigendian_rpc()
     test_rpc_remote_module()
     test_rpc_return_func()
     test_rpc_file_exchange()
-- 
GitLab