diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h
index 6faf4d21de942bdded54e0aab100cd22a7ade0e6..33c46b40d670c46336e763ddf619a1e7a9e33452 100644
--- a/include/tvm/runtime/c_runtime_api.h
+++ b/include/tvm/runtime/c_runtime_api.h
@@ -373,34 +373,6 @@ TVM_DLL int TVMFuncListGlobalNames(int *out_size,
                                    const char*** out_array);
 
 // Array related apis for quick proptying
-/*!
- * \brief Initialize certain type of devices, this may
- *  not be necessary for all device types. But is needed for OpenCL.
- *
- * \param dev_mask The device mask of device type to be initialized
- * \param option_keys Additional option  keys to pass.
- * \param option_vals Additional option values to pass
- * \param num_options Number of options to be passed into it.
- * \param out_code 1: success, 0: already initialized
- * \return 0 when success, -1 when failure happens
- */
-TVM_DLL int TVMDeviceInit(int dev_mask,
-                          const char** option_keys,
-                          const char** option_vals,
-                          int num_options,
-                          int *out_code);
-
-
-/*!
- * \brief Whether the specified context is enabled.
- *
- * \param ctx The context to be checked.
- * \param out_enabled whether the ctx is enabled.
- * \return Whether the function is successful.
- */
-TVM_DLL int TVMContextEnabled(TVMContext ctx,
-                              int* out_enabled);
-
 /*!
  * \brief Allocate a nd-array's memory,
  *  including space of shape, of given spec.
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h
index 558dd9b9541ea4c0fbc1f20c3380b6ce63aa30e2..592b418e591855e77964fde7d9c0206d69ec00ba 100644
--- a/include/tvm/runtime/packed_func.h
+++ b/include/tvm/runtime/packed_func.h
@@ -535,8 +535,9 @@ inline const char* TypeCode2Str(int type_code) {
 }
 
 inline std::ostream& operator<<(std::ostream& os, TVMType t) {  // NOLINT(*)
-  os << TypeCode2Str(t.code)
-     << static_cast<int>(t.bits);
+  os << TypeCode2Str(t.code);
+  if (t.code == kHandle) return os;
+  os << static_cast<int>(t.bits);
   if (t.lanes != 1) {
     os << 'x' << static_cast<int>(t.lanes);
   }
@@ -559,7 +560,7 @@ inline TVMType String2TVMType(std::string s) {
     t.code = kUInt; scan = s.c_str() + 4;
   } else if (s.substr(0, 5) == "float") {
     t.code = kFloat; scan = s.c_str() + 5;
-  } else if (s == "handle") {
+  } else if (s.substr(0, 6) == "handle") {
     t.code = kHandle;
     t.bits = 64;  // handle uses 64 bit by default.
     scan = s.c_str() + 6;
diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py
index bd4ef614f5859c28ad78da0474151a85d634344b..4e294bdafdf296f379558b250c41a3005f861006 100644
--- a/python/tvm/__init__.py
+++ b/python/tvm/__init__.py
@@ -15,7 +15,7 @@ from . import schedule
 from . import module
 
 from . import ndarray as nd
-from .ndarray import cpu, gpu, opencl, init_opencl, cl
+from .ndarray import cpu, gpu, opencl, cl
 
 from ._base import TVMError
 from .api import *
diff --git a/python/tvm/_ctypes/_ndarray.py b/python/tvm/_ctypes/_ndarray.py
index b6fc4d4dfcacedbead5f87805955661e8b349966..91f800cdd5ba8d7966ce7c8f80b22f11ea87c3f1 100644
--- a/python/tvm/_ctypes/_ndarray.py
+++ b/python/tvm/_ctypes/_ndarray.py
@@ -7,10 +7,9 @@ import ctypes
 import numpy as np
 
 from .._base import _LIB, check_call
-from .._base import c_array, c_str
+from .._base import c_array
 from ._types import TVMType, tvm_index_t
 
-
 class TVMContext(ctypes.Structure):
     """TVM context strucure."""
     _fields_ = [("dev_mask", ctypes.c_int),
@@ -29,12 +28,6 @@ class TVMContext(ctypes.Structure):
         return "%s(%d)" % (
             TVMContext.MASK2STR[self.dev_mask], self.dev_id)
 
-    @property
-    def enabled(self):
-        ret = ctypes.c_int()
-        check_call(_LIB.TVMContextEnabled(self, ctypes.byref(ret)))
-        return ret.value != 0
-
 
 class TVMArray(ctypes.Structure):
     """TVMValue in C API"""
@@ -141,30 +134,6 @@ def sync(ctx):
     check_call(_LIB.TVMSynchronize(ctx, None))
 
 
-def init_opencl(**kwargs):
-    """Initialize the opencl with the options.
-
-    Parameters
-    ----------
-    kwargs : dict
-        The options
-    """
-    keys = []
-    vals = []
-    for k, v in kwargs.items():
-        keys.append(c_str(k))
-        vals.append(c_str(v))
-    dev_mask = ctypes.c_int(4)
-    out_code = ctypes.c_int()
-    check_call(_LIB.TVMDeviceInit(
-        dev_mask,
-        c_array(ctypes.c_char_p, keys),
-        c_array(ctypes.c_char_p, vals),
-        ctypes.c_int(len(keys)),
-        ctypes.byref(out_code)))
-    return out_code.value != 0
-
-
 class NDArrayBase(object):
     """A simple Device/CPU Array object in runtime."""
     __slots__ = ["handle"]
diff --git a/python/tvm/addon/testing.py b/python/tvm/addon/testing.py
new file mode 100644
index 0000000000000000000000000000000000000000..642dadd37a6ade1a79a99cc7b629e48ee1c3d475
--- /dev/null
+++ b/python/tvm/addon/testing.py
@@ -0,0 +1,34 @@
+"""Utilities to make tempdir"""
+from __future__ import absolute_import as _abs
+import os
+import tempfile
+import shutil
+
+class TempDirectory(object):
+    """Helper object to manage temp directory during testing"""
+    def __init__(self):
+        self.temp_dir = tempfile.mkdtemp()
+
+    def __del__(self):
+        shutil.rmtree(self.temp_dir)
+
+    def relpath(self, name):
+        """Relative path in temp dir
+
+        Parameters
+        ----------
+        name : str
+            The name of the file.
+        """
+        return os.path.join(self.temp_dir, name)
+
+
+def tempdir():
+    """Return a new temp dir which deletes the contents when exit
+
+    Returns
+    -------
+    temp : TempDirectory
+        The temp directory object
+    """
+    return TempDirectory()
diff --git a/python/tvm/libinfo.py b/python/tvm/libinfo.py
index 43679dd73194e72995808e7f0fedd0b24af3e7ee..967f3e9c77ffda8c53c57dc7cc3328b63211bd9e 100644
--- a/python/tvm/libinfo.py
+++ b/python/tvm/libinfo.py
@@ -1,9 +1,11 @@
 # coding: utf-8
 """Information about nnvm."""
 from __future__ import absolute_import
+import sys
 import os
 import platform
 
+
 def find_lib_path():
     """Find dynamic library files.
 
@@ -12,6 +14,7 @@ def find_lib_path():
     lib_path : list(string)
         List of all found path to the libraries
     """
+    use_runtime = os.environ.get("TVM_USE_RUNTIME_LIB", "0") != "0"
     curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
     api_path = os.path.join(curr_path, '../../lib/')
     cmake_build_path = os.path.join(curr_path, '../../build/Release/')
@@ -26,15 +29,24 @@ def find_lib_path():
             dll_path.append(os.path.join(curr_path, '../../windows', vs_configuration))
     elif os.name == "posix" and os.environ.get('LD_LIBRARY_PATH', None):
         dll_path.extend([p.strip() for p in os.environ['LD_LIBRARY_PATH'].split(":")])
+
     if os.name == 'nt':
-        dll_path = [os.path.join(p, 'libtvm.dll') for p in dll_path]
+        lib_dll_path = [os.path.join(p, 'libtvm.dll') for p in dll_path]
+        runtime_dll_path = [os.path.join(p, 'libtvm_runtime.dll') for p in dll_path]
     else:
-        dll_path = [os.path.join(p, 'libtvm.so') for p in dll_path]
-    lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)]
-    if len(lib_path) == 0:
+        lib_dll_path = [os.path.join(p, 'libtvm.so') for p in dll_path]
+        runtime_dll_path = [os.path.join(p, 'libtvm_runtime.so') for p in dll_path]
+
+    dll_path = runtime_dll_path if use_runtime else lib_dll_path
+    lib_found = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)]
+
+    if len(lib_found) == 0:
         raise RuntimeError('Cannot find the files.\n' +
                            'List of candidates:\n' + str('\n'.join(dll_path)))
-    return lib_path
+    if use_runtime:
+        sys.stderr.write("Loading runtime library... this is execution only\n")
+        sys.stderr.flush()
+    return lib_found
 
 
 # current version
diff --git a/python/tvm/ndarray.py b/python/tvm/ndarray.py
index 2f2492eeafbcd5ee48d96deda577adb5c85b3c3c..d324a9ffddca90f9bd03b85a07cfd4c5c4dd7f24 100644
--- a/python/tvm/ndarray.py
+++ b/python/tvm/ndarray.py
@@ -9,7 +9,6 @@ import numpy as _np
 from ._ctypes._ndarray import TVMContext, TVMType, NDArrayBase
 from ._ctypes._ndarray import cpu, gpu, opencl, empty, sync
 from ._ctypes._ndarray import _init_ndarray_module
-from ._ctypes._ndarray import init_opencl
 from ._ctypes._function import Function
 
 cl = opencl
diff --git a/src/api/api_codegen.cc b/src/api/api_codegen.cc
index 4f267038a7382c25e20fd9b70e5203312623e77b..9616dccb306a80406aa4ab5ed27daad136ea318e 100644
--- a/src/api/api_codegen.cc
+++ b/src/api/api_codegen.cc
@@ -21,7 +21,7 @@ TVM_REGISTER_API(_codegen_build)
     }
   });
 
-TVM_REGISTER_API(_codegen_target_enabled)
+TVM_REGISTER_API(_codegen_enabled)
 .set_body([](TVMArgs args, TVMRetValue *ret) {
     *ret = TargetEnabled(args[0]);
   });
diff --git a/src/codegen/build_cuda.cc b/src/codegen/build_cuda.cc
index 17dd4d0b183406399733714ebc63099602a116fb..a195cea8fa8dae9855671140c001c1176ae673b2 100644
--- a/src/codegen/build_cuda.cc
+++ b/src/codegen/build_cuda.cc
@@ -61,10 +61,13 @@ runtime::Module BuildCUDA(Array<LoweredFunc> funcs) {
   if (const auto* f = Registry::Get("tvm_callback_cuda_postproc")) {
     code = (*f)(code).operator std::string();
   }
-
+  std::string fmt = "ptx";
   std::string ptx;
   if (const auto* f = Registry::Get("tvm_callback_cuda_compile")) {
     ptx = (*f)(code).operator std::string();
+    // Dirty matching to check PTX vs cubin.
+    // TODO(tqchen) more reliable checks
+    if (ptx[0] != '/') fmt = "cubin";
   } else {
     ptx = NVRTCCompile(code);
   }
@@ -80,7 +83,7 @@ runtime::Module BuildCUDA(Array<LoweredFunc> funcs) {
     }
     fmap[f->name] = info;
   }
-  return CUDAModuleCreate(ptx, "ptx", fmap, code);
+  return CUDAModuleCreate(ptx, fmt, fmap, code);
 }
 
 TVM_REGISTER_API(_codegen_build_cuda)
diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc
index 925d1243da7af0cb0a19760e3cb127f71d7c1401..3ffe02ed518eb387d4e3508a03dc68df1a78f074 100644
--- a/src/runtime/c_runtime_api.cc
+++ b/src/runtime/c_runtime_api.cc
@@ -200,38 +200,6 @@ int TVMFuncCreateFromCFunc(TVMPackedCFunc func,
   API_END();
 }
 
-int TVMDeviceInit(int dev_mask,
-                  const char** option_keys,
-                  const char** option_vals,
-                  int num_options,
-                  int* out_code) {
-  API_BEGIN();
-  *out_code = 1;
-  switch (dev_mask) {
-    case kOpenCL: {
-      *out_code = DeviceInit<kOpenCL>(option_keys, option_vals, num_options);
-      break;
-    }
-    default: break;
-  }
-  API_END();
-}
-
-int TVMContextEnabled(TVMContext ctx,
-                      int* out_enabled) {
-  API_BEGIN();
-  if (ctx.dev_mask == kGPU && TVM_CUDA_RUNTIME == 0) {
-    *out_enabled = 0;
-  } else if (ctx.dev_mask == kOpenCL && TVM_OPENCL_RUNTIME == 0) {
-    *out_enabled = 0;
-  } else {
-    TVM_DEVICE_SWITCH(ctx, {
-        *out_enabled = CheckEnabled<xpu>(ctx);
-      });
-  }
-  API_END();
-}
-
 int TVMArrayAlloc(const tvm_index_t* shape,
                   tvm_index_t ndim,
                   TVMType dtype,
diff --git a/src/runtime/cuda/cuda_module.cc b/src/runtime/cuda/cuda_module.cc
index 8b3a79c5a75d7f6fbb10edfcd145504fbfcbce0f..12bc1ca7fd40a5e5a7edbab79122d2112e00ab50 100644
--- a/src/runtime/cuda/cuda_module.cc
+++ b/src/runtime/cuda/cuda_module.cc
@@ -5,6 +5,7 @@
 #include "./cuda_module.h"
 
 #if TVM_CUDA_RUNTIME
+
 #include <tvm/runtime/registry.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
@@ -60,7 +61,12 @@ class CUDAModuleNode : public runtime::ModuleNode {
 
   void SaveToFile(const std::string& file_name,
                   const std::string& format) final {
-    LOG(FATAL) << "Not implemented";
+    std::string fmt = GetFileFormat(file_name, format);
+    CHECK_EQ(fmt, fmt_)
+        << "Can only save to format=" << fmt_;
+    std::string meta_file = GetMetaFilePath(file_name);
+    SaveMetaDataToFile(meta_file, fmap_);
+    SaveBinaryToFile(file_name, data_);
   }
 
   std::string GetSource(const std::string& format) final {
@@ -212,9 +218,13 @@ Module CUDAModuleCreate(
 // Load module from module.
 Module CUDAModuleLoad(const std::string& file_name,
                       const std::string& format) {
+  std::string data;
+  std::unordered_map<std::string, FunctionInfo> fmap;
   std::string fmt = GetFileFormat(file_name, format);
-  std::string data = LoadBinaryFile(file_name);
-  return CUDAModuleCreate(data, fmt, {{}}, std::string());
+  std::string meta_file = GetMetaFilePath(file_name);
+  LoadBinaryFromFile(file_name, &data);
+  LoadMetaDataFromFile(meta_file, &fmap);
+  return CUDAModuleCreate(data, fmt, fmap, std::string());
 }
 
 TVM_REGISTER_GLOBAL(_module_loadfile_cubin)
diff --git a/src/runtime/device_api.h b/src/runtime/device_api.h
index f551ca9ee8c840041de92a9ef55c37d6442758a0..82e998fcff3756390466797226916e9441d1c847 100644
--- a/src/runtime/device_api.h
+++ b/src/runtime/device_api.h
@@ -11,31 +11,6 @@
 
 namespace tvm {
 namespace runtime {
-/*!
- * \brief Initialize the device.
- * \param option_keys Additional option  keys to pass.
- * \param option_vals Additional option values to pass
- * \param num_options Number of options to be passed into it.
- * \return 0 if success, 1: if already initialized
- * \tparam xpu The device mask.
- */
-template<TVMDeviceMask xpu>
-inline bool DeviceInit(const char** option_keys,
-                       const char** option_vals,
-                       int num_options) {
-  return true;
-}
-
-/*!
- * \brief Whether ctx is enabled.
- * \param ctx The device context to perform operation.
- * \tparam xpu The device mask.
- */
-template<TVMDeviceMask xpu>
-inline bool CheckEnabled(TVMContext ctx) {
-  return true;
-}
-
 /*!
  * \brief Allocate a data space on device.
  * \param ctx The device context to perform operation.
diff --git a/src/runtime/file_util.cc b/src/runtime/file_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d69b39be1b1022ae453bf74234253663467d51e4
--- /dev/null
+++ b/src/runtime/file_util.cc
@@ -0,0 +1,112 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file file_util.cc
+ */
+#include <dmlc/json.h>
+#include <dmlc/logging.h>
+#include <tvm/runtime/packed_func.h>
+#include <fstream>
+#include "./file_util.h"
+
+namespace tvm {
+namespace runtime {
+
+void FunctionInfo::Save(dmlc::JSONWriter* writer) const {
+  std::vector<std::string> sarg_types(arg_types.size());
+  for (size_t i = 0; i < arg_types.size(); ++i) {
+    sarg_types[i] = TVMType2String(arg_types[i]);
+  }
+  writer->BeginObject();
+  writer->WriteObjectKeyValue("name", name);
+  writer->WriteObjectKeyValue("arg_types", sarg_types);
+  writer->WriteObjectKeyValue("thread_axis_tags", thread_axis_tags);
+  writer->EndObject();
+}
+
+void FunctionInfo::Load(dmlc::JSONReader* reader) {
+  dmlc::JSONObjectReadHelper helper;
+  std::vector<std::string> sarg_types;
+  helper.DeclareField("name", &name);
+  helper.DeclareField("arg_types", &sarg_types);
+  helper.DeclareField("thread_axis_tags", &thread_axis_tags);
+  helper.ReadAllFields(reader);
+  arg_types.resize(sarg_types.size());
+  for (size_t i = 0; i < arg_types.size(); ++i) {
+    arg_types[i] = String2TVMType(sarg_types[i]);
+  }
+}
+
+std::string GetFileFormat(const std::string& file_name,
+                          const std::string& format) {
+  std::string fmt = format;
+  if (fmt.length() == 0) {
+    size_t pos = file_name.find_last_of(".");
+    if (pos != std::string::npos) {
+      return file_name.substr(pos + 1, file_name.length() - pos - 1);
+    } else {
+      return "";
+    }
+  } else {
+    return format;
+  }
+}
+
+std::string GetMetaFilePath(const std::string& file_name) {
+  size_t pos  = file_name.find_last_of(".");
+  if (pos != std::string::npos) {
+    return file_name.substr(0, pos) + ".tvm_meta.json";
+  } else {
+    return file_name + ".tvm_meta.json";
+  }
+}
+
+void LoadBinaryFromFile(const std::string& file_name,
+                        std::string* data) {
+  std::ifstream fs(file_name, std::ios::in | std::ios::binary);
+  CHECK(!fs.fail()) << "Cannot open " << file_name;
+  // get its size:
+  fs.seekg(0, std::ios::end);
+  size_t size = fs.tellg();
+  fs.seekg(0, std::ios::beg);
+  data->resize(size);
+  fs.read(&(*data)[0], size);
+}
+
+void SaveBinaryToFile(
+    const std::string& file_name,
+    const std::string& data) {
+  std::ofstream fs(file_name, std::ios::out | std::ios::binary);
+  CHECK(!fs.fail()) << "Cannot open " << file_name;
+  fs.write(&data[0], data.length());
+}
+
+void SaveMetaDataToFile(
+    const std::string& file_name,
+    const std::unordered_map<std::string, FunctionInfo>& fmap) {
+  std::string version = "0.1.0";
+  std::ofstream fs(file_name.c_str());
+  CHECK(!fs.fail()) << "Cannot open file " << file_name;
+  dmlc::JSONWriter writer(&fs);
+  writer.BeginObject();
+  writer.WriteObjectKeyValue("tvm_version", version);
+  writer.WriteObjectKeyValue("func_info", fmap);
+  writer.EndObject();
+  fs.close();
+}
+
+void LoadMetaDataFromFile(
+    const std::string& file_name,
+    std::unordered_map<std::string, FunctionInfo>* fmap) {
+  std::ifstream fs(file_name.c_str());
+  CHECK(!fs.fail()) << "Cannot open file " << file_name;
+  std::string version;
+  dmlc::JSONReader reader(&fs);
+  dmlc::JSONObjectReadHelper helper;
+  helper.DeclareField("tvm_version", &version);
+  helper.DeclareField("func_info", fmap);
+  helper.ReadAllFields(&reader);
+  fs.close();
+}
+
+}  // namespace runtime
+}  // namespace tvm
diff --git a/src/runtime/file_util.h b/src/runtime/file_util.h
index 15759130682d72a989171ed717854baeb6dc1b1b..6f7d638c349732912ecd40f2fc6dd758e42034b4 100644
--- a/src/runtime/file_util.h
+++ b/src/runtime/file_util.h
@@ -6,9 +6,8 @@
 #ifndef TVM_RUNTIME_FILE_UTIL_H_
 #define TVM_RUNTIME_FILE_UTIL_H_
 
-#include <dmlc/logging.h>
-#include <fstream>
 #include <string>
+#include "./meta_data.h"
 
 namespace tvm {
 namespace runtime {
@@ -17,39 +16,48 @@ namespace runtime {
  * \param file_name The name of the file.
  * \param format The format of the file.
  */
-inline std::string GetFileFormat(const std::string& file_name,
-                                 const std::string& format) {
-  std::string fmt = format;
-  if (fmt.length() == 0) {
-    size_t pos = file_name.find_last_of(".");
-    if (pos != std::string::npos) {
-      return file_name.substr(pos + 1, file_name.length() - pos - 1);
-    } else {
-      return "";
-    }
-  } else {
-    return format;
-  }
-}
+std::string GetFileFormat(const std::string& file_name,
+                          const std::string& format);
+
+/*!
+ * \brief Get meta file path given file name.
+ * \param file_name The name of the file.
+ */
+std::string GetMetaFilePath(const std::string& file_name);
+
+/*!
+ * \brief Load binary file into a in-memory buffer.
+ * \param file_name The name of the file.
+ * \param data The data to be loaded.
+ */
+void LoadBinaryFromFile(const std::string& file_name,
+                        std::string* data);
 
 /*!
  * \brief Load binary file into a in-memory buffer.
  * \param file_name The name of the file.
+ * \param data The binary data to be saved.
+ */
+void SaveBinaryToFile(const std::string& file_name,
+                      const std::string& data);
+
+/*!
+ * \brief Save meta data to file.
+ * \param file_name The name of the file.
+ * \param fmap The function info map.
  */
-inline std::string LoadBinaryFile(const std::string& file_name) {
-  std::ifstream fs(file_name, std::ios::in | std::ios::binary);
-  CHECK(!fs.fail())
-      << "Cannot open " << file_name;
-  // get its size:
-  fs.seekg(0, std::ios::end);
-  size_t size = fs.tellg();
-  fs.seekg(0, std::ios::beg);
-  std::string data;
-  data.resize(size);
-  fs.read(&data[0], size);
-  return data;
-}
+void SaveMetaDataToFile(
+    const std::string& file_name,
+    const std::unordered_map<std::string, FunctionInfo>& fmap);
 
+/*!
+ * \brief Load meta data from file.
+ * \param file_name The name of the file.
+ * \param fmap The function info map.
+ */
+void LoadMetaDataFromFile(
+    const std::string& file_name,
+    std::unordered_map<std::string, FunctionInfo>* fmap);
 }  // namespace runtime
 }  // namespace tvm
 #endif  // TVM_RUNTIME_FILE_UTIL_H_
diff --git a/src/runtime/meta_data.h b/src/runtime/meta_data.h
index 3e8a269351d0a06fc2d61c6aa65a3f8a88ebb506..69e6f6a50529cc07c780d8d92e76d23cfc6201ec 100644
--- a/src/runtime/meta_data.h
+++ b/src/runtime/meta_data.h
@@ -27,30 +27,8 @@ struct FunctionInfo {
   std::vector<TVMType> arg_types;
   std::vector<std::string> thread_axis_tags;
 
-  void Save(dmlc::JSONWriter *writer) const {
-    std::vector<std::string> sarg_types(arg_types.size());
-    for (size_t i = 0; i < arg_types.size(); ++i) {
-      sarg_types[i] = TVMType2String(arg_types[i]);
-    }
-    writer->BeginObject();
-    writer->WriteObjectKeyValue("name", name);
-    writer->WriteObjectKeyValue("arg_types", sarg_types);
-    writer->WriteObjectKeyValue("thread_axis_tags", thread_axis_tags);
-    writer->EndObject();
-  }
-
-  void Load(dmlc::JSONReader *reader) {
-    dmlc::JSONObjectReadHelper helper;
-    std::vector<std::string> sarg_types;
-    helper.DeclareField("name", &name);
-    helper.DeclareField("arg_types", &sarg_types);
-    helper.DeclareField("thread_axis_tags", &thread_axis_tags);
-    helper.ReadAllFields(reader);
-    arg_types.resize(sarg_types.size());
-    for (size_t i = 0; i < arg_types.size(); ++i) {
-      arg_types[i] = String2TVMType(sarg_types[i]);
-    }
-  }
+  void Save(dmlc::JSONWriter *writer) const;
+  void Load(dmlc::JSONReader *reader);
 };
 
 }  // namespace runtime
diff --git a/src/runtime/module.cc b/src/runtime/module.cc
index 4b4ffea94d11115292ecbdeeef3cc9a6f8b2f2e8..0b630c1fa870f562d5e35ef6a814e7c4ea7a90cc 100644
--- a/src/runtime/module.cc
+++ b/src/runtime/module.cc
@@ -83,6 +83,25 @@ const PackedFunc* ModuleNode::GetFuncFromEnv(const std::string& name) {
   }
 }
 
+bool RuntimeEnabled(const std::string& target) {
+  std::string load_f_name;
+  if (target == "cpu") {
+    return true;
+  } else if (target == "cuda" || target == "gpu") {
+    load_f_name = "_module_loadfile_ptx";
+  } else if (target == "cl" || target == "opencl") {
+    load_f_name = "_module_loadfile_cl";
+  } else {
+    LOG(FATAL) << "Unknown optional runtime " << target;
+  }
+  return runtime::Registry::Get(load_f_name) != nullptr;
+}
+
+TVM_REGISTER_GLOBAL(_module_enabled)
+.set_body([](TVMArgs args, TVMRetValue *ret) {
+    *ret = RuntimeEnabled(args[0]);
+    });
+
 TVM_REGISTER_GLOBAL(_module__GetSource)
 .set_body([](TVMArgs args, TVMRetValue *ret) {
     *ret = args[0].operator Module()->GetSource(args[1]);
diff --git a/src/runtime/opencl/device_api_opencl.h b/src/runtime/opencl/device_api_opencl.h
index 3d2c2d1b458d654fd4288bcc210418cac3515f60..792682bbe7fe6c570a533c462255f33dc491a065 100644
--- a/src/runtime/opencl/device_api_opencl.h
+++ b/src/runtime/opencl/device_api_opencl.h
@@ -15,121 +15,6 @@
 
 namespace tvm {
 namespace runtime {
-namespace cl {
-
-inline std::string GetPlatformInfo(
-    cl_platform_id pid, cl_platform_info param_name) {
-  size_t ret_size;
-  OPENCL_CALL(clGetPlatformInfo(pid, param_name, 0, nullptr, &ret_size));
-  std::string ret;
-  ret.resize(ret_size);
-  OPENCL_CALL(clGetPlatformInfo(pid, param_name, ret_size, &ret[0], nullptr));
-  return ret;
-}
-
-inline std::string GetDeviceInfo(
-    cl_device_id pid, cl_device_info param_name) {
-  size_t ret_size;
-  OPENCL_CALL(clGetDeviceInfo(pid, param_name, 0, nullptr, &ret_size));
-  std::string ret;
-  ret.resize(ret_size);
-  OPENCL_CALL(clGetDeviceInfo(pid, param_name, ret_size, &ret[0], nullptr));
-  return ret;
-}
-
-inline std::vector<cl_platform_id> GetPlatformIDs() {
-  cl_uint ret_size;
-  OPENCL_CALL(clGetPlatformIDs(0, nullptr, &ret_size));
-  std::vector<cl_platform_id> ret;
-  ret.resize(ret_size);
-  OPENCL_CALL(clGetPlatformIDs(ret_size, &ret[0], nullptr));
-  return ret;
-}
-
-inline std::vector<cl_device_id> GetDeviceIDs(
-    cl_platform_id pid, std::string device_type) {
-  cl_device_type dtype = CL_DEVICE_TYPE_ALL;
-  if (device_type == "cpu") dtype = CL_DEVICE_TYPE_CPU;
-  if (device_type == "gpu") dtype = CL_DEVICE_TYPE_CPU;
-  if (device_type == "accelerator") dtype = CL_DEVICE_TYPE_ACCELERATOR;
-  cl_uint ret_size;
-  OPENCL_CALL(clGetDeviceIDs(pid, dtype, 0, nullptr, &ret_size));
-  std::vector<cl_device_id> ret;
-  ret.resize(ret_size);
-  OPENCL_CALL(clGetDeviceIDs(pid, dtype, ret_size, &ret[0], nullptr));
-  return ret;
-}
-
-inline bool MatchPlatformInfo(
-    cl_platform_id pid,
-    cl_platform_info param_name,
-    std::string value) {
-  if (value.length() == 0) return true;
-  std::string param_value = GetPlatformInfo(pid, param_name);
-  return param_value.find(value) != std::string::npos;
-}
-
-}  // namespace cl
-
-template<>
-inline bool DeviceInit<kOpenCL>(const char** option_keys,
-                                const char** option_vals,
-                                int num_options) {
-  cl::OpenCLWorkspace* w = cl::OpenCLWorkspace::Global();
-  std::lock_guard<std::mutex>(w->mu);
-  if (w->initialized()) return false;
-  // matching conditions
-  std::string platform_name, device_type;
-  for (int i = 0; i < num_options; ++i) {
-    std::string key = option_keys[i];
-    std::string val = option_vals[i];
-    if (key == "platform_name") {
-      platform_name = val;
-    } else if (key == "device_type") {
-      device_type = val;
-    } else {
-      LOG(FATAL) << "unknown DeviceInit option " << key;
-    }
-  }
-  // matched platforms
-  std::vector<cl_platform_id> platform_matched;
-  for (cl_platform_id pid : cl::GetPlatformIDs()) {
-    bool matched = true;
-    if (!cl::MatchPlatformInfo(pid, CL_PLATFORM_NAME, platform_name)) matched = false;
-    if (matched) platform_matched.push_back(pid);
-  }
-  if (platform_matched.size() == 0) {
-    LOG(FATAL) << "No OpenCL platform matched given existing options ...";
-  }
-  if (platform_matched.size() > 1) {
-    LOG(WARNING) << "Multiple OpenCL platforms matched, use the first one ... ";
-  }
-  w->platform_id = platform_matched[0];
-
-  LOG(INFO) << "Initialize OpenCL platform \'"
-            << cl::GetPlatformInfo(w->platform_id, CL_PLATFORM_NAME) << '\'';
-  std::vector<cl_device_id> devices_matched =
-      cl::GetDeviceIDs(w->platform_id, device_type);
-  CHECK_GT(devices_matched.size(), 0U)
-      << "No OpenCL device any device matched given the options";
-  w->devices = devices_matched;
-  cl_int err_code;
-  w->context = clCreateContext(
-      nullptr, w->devices.size(), &(w->devices[0]),
-      nullptr, nullptr, &err_code);
-  OPENCL_CHECK_ERROR(err_code);
-  CHECK_EQ(w->queues.size(), 0U);
-  for (size_t i = 0; i < w->devices.size(); ++i) {
-    cl_device_id did = w->devices[i];
-    w->queues.push_back(
-        clCreateCommandQueue(w->context, did, 0, &err_code));
-    OPENCL_CHECK_ERROR(err_code);
-    LOG(INFO) << "opencl(" << i
-              << ")=\'" << cl::GetDeviceInfo(did, CL_DEVICE_NAME)
-              << "\' cl_device_id=" << did;
-  }
-  return true;
-}
 
 template<>
 inline void* AllocDataSpace<kOpenCL>(TVMContext ctx, size_t size, size_t alignment) {
diff --git a/src/runtime/opencl/opencl_module.cc b/src/runtime/opencl/opencl_module.cc
index 6c0b4619bd1de4e3abb0b5c0ee76d77020823c88..ad3f2620eb0c5d577e74c081093a6022f0711373 100644
--- a/src/runtime/opencl/opencl_module.cc
+++ b/src/runtime/opencl/opencl_module.cc
@@ -7,11 +7,14 @@
 
 #if TVM_OPENCL_RUNTIME
 
+#include <tvm/runtime/registry.h>
 #include <vector>
 #include <string>
 #include <unordered_map>
 #include "../void_addr_args.h"
 #include "../thread_storage_scope.h"
+#include "../meta_data.h"
+#include "../file_util.h"
 
 namespace tvm {
 namespace runtime {
@@ -67,7 +70,12 @@ class OpenCLModuleNode : public ModuleNode {
 
   void SaveToFile(const std::string& file_name,
                   const std::string& format) final {
-    LOG(FATAL) << "Not implemented";
+    std::string fmt = GetFileFormat(file_name, format);
+    CHECK_EQ(fmt, fmt_)
+        << "Can only save to format=" << fmt_;
+    std::string meta_file = GetMetaFilePath(file_name);
+    SaveMetaDataToFile(meta_file, fmap_);
+    SaveBinaryToFile(file_name, data_);
   }
 
   std::string GetSource(const std::string& format) final {
@@ -294,6 +302,27 @@ Module OpenCLModuleCreate(
   return Module(n);
 }
 
+// Load module from module.
+Module OpenCLModuleLoad(const std::string& file_name,
+                        const std::string& format) {
+  std::string data;
+  std::unordered_map<std::string, FunctionInfo> fmap;
+  std::string fmt = GetFileFormat(file_name, format);
+  std::string meta_file = GetMetaFilePath(file_name);
+  LoadBinaryFromFile(file_name, &data);
+  LoadMetaDataFromFile(meta_file, &fmap);
+  return OpenCLModuleCreate(data, fmt, fmap);
+}
+
+TVM_REGISTER_GLOBAL(_module_loadfile_cl)
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    *rv = OpenCLModuleLoad(args[0], args[1]);
+  });
+
+TVM_REGISTER_GLOBAL(_module_loadfile_clbin)
+.set_body([](TVMArgs args, TVMRetValue* rv) {
+    *rv = OpenCLModuleLoad(args[0], args[1]);
+  });
 }  // namespace runtime
 }  // namespace tvm
 
diff --git a/src/runtime/opencl/opencl_workspace.cc b/src/runtime/opencl/opencl_workspace.cc
index 1f79f4280bb6b7884d59ea7804eaf3e039dd76ed..1a89efbbccd6447eaf2e8b0ed9ec57f58ce37ca6 100644
--- a/src/runtime/opencl/opencl_workspace.cc
+++ b/src/runtime/opencl/opencl_workspace.cc
@@ -6,6 +6,7 @@
 
 #if TVM_OPENCL_RUNTIME
 
+#include <tvm/runtime/registry.h>
 #include <dmlc/thread_local.h>
 
 namespace tvm {
@@ -23,6 +24,123 @@ OpenCLThreadEntry* OpenCLThreadEntry::ThreadLocal() {
   return OpenCLThreadStore::Get();
 }
 
+std::string GetPlatformInfo(
+    cl_platform_id pid, cl_platform_info param_name) {
+  size_t ret_size;
+  OPENCL_CALL(clGetPlatformInfo(pid, param_name, 0, nullptr, &ret_size));
+  std::string ret;
+  ret.resize(ret_size);
+  OPENCL_CALL(clGetPlatformInfo(pid, param_name, ret_size, &ret[0], nullptr));
+  return ret;
+}
+
+std::string GetDeviceInfo(
+    cl_device_id pid, cl_device_info param_name) {
+  size_t ret_size;
+  OPENCL_CALL(clGetDeviceInfo(pid, param_name, 0, nullptr, &ret_size));
+  std::string ret;
+  ret.resize(ret_size);
+  OPENCL_CALL(clGetDeviceInfo(pid, param_name, ret_size, &ret[0], nullptr));
+  return ret;
+}
+
+std::vector<cl_platform_id> GetPlatformIDs() {
+  cl_uint ret_size;
+  OPENCL_CALL(clGetPlatformIDs(0, nullptr, &ret_size));
+  std::vector<cl_platform_id> ret;
+  ret.resize(ret_size);
+  OPENCL_CALL(clGetPlatformIDs(ret_size, &ret[0], nullptr));
+  return ret;
+}
+
+std::vector<cl_device_id> GetDeviceIDs(
+    cl_platform_id pid, std::string device_type) {
+  cl_device_type dtype = CL_DEVICE_TYPE_ALL;
+  if (device_type == "cpu") dtype = CL_DEVICE_TYPE_CPU;
+  if (device_type == "gpu") dtype = CL_DEVICE_TYPE_GPU;
+  if (device_type == "accelerator") dtype = CL_DEVICE_TYPE_ACCELERATOR;
+  cl_uint ret_size;
+  OPENCL_CALL(clGetDeviceIDs(pid, dtype, 0, nullptr, &ret_size));
+  std::vector<cl_device_id> ret;
+  ret.resize(ret_size);
+  OPENCL_CALL(clGetDeviceIDs(pid, dtype, ret_size, &ret[0], nullptr));
+  return ret;
+}
+
+bool MatchPlatformInfo(
+    cl_platform_id pid,
+    cl_platform_info param_name,
+    std::string value) {
+  if (value.length() == 0) return true;
+  std::string param_value = GetPlatformInfo(pid, param_name);
+  return param_value.find(value) != std::string::npos;
+}
+
+bool InitOpenCL(TVMArgs args, TVMRetValue* rv) {
+  cl::OpenCLWorkspace* w = cl::OpenCLWorkspace::Global();
+  std::lock_guard<std::mutex> lock(w->mu);
+  if (w->initialized()) return false;
+  // matching conditions
+  std::string platform_name, device_type;
+
+  for (int i = 0; i < args.num_args; ++i) {
+    std::string arg = args[i];
+    size_t pos = arg.find_first_of('=');
+    CHECK_NE(pos, std::string::npos)
+        << "Arguments need to be key=value";
+    std::string key = arg.substr(0, pos);
+    std::string val = arg.substr(pos + 1, arg.length() - pos - 1);
+    if (key == "platform_name") {
+      platform_name = val;
+    } else if (key == "device_type") {
+      device_type = val;
+    } else {
+      LOG(FATAL) << "unknown DeviceInit option " << key;
+    }
+  }
+  // matched platforms
+  std::vector<cl_platform_id> platform_matched;
+  for (cl_platform_id pid : cl::GetPlatformIDs()) {
+    bool matched = true;
+    if (!cl::MatchPlatformInfo(pid, CL_PLATFORM_NAME, platform_name)) matched = false;
+    if (matched) platform_matched.push_back(pid);
+  }
+  if (platform_matched.size() == 0) {
+    LOG(FATAL) << "No OpenCL platform matched given existing options ...";
+  }
+  if (platform_matched.size() > 1) {
+    LOG(WARNING) << "Multiple OpenCL platforms matched, use the first one ... ";
+  }
+  w->platform_id = platform_matched[0];
+
+  LOG(INFO) << "Initialize OpenCL platform \'"
+            << cl::GetPlatformInfo(w->platform_id, CL_PLATFORM_NAME) << '\'';
+  std::vector<cl_device_id> devices_matched =
+      cl::GetDeviceIDs(w->platform_id, device_type);
+  CHECK_GT(devices_matched.size(), 0U)
+      << "No OpenCL device matched given the options";
+  w->devices = devices_matched;
+  cl_int err_code;
+  w->context = clCreateContext(
+      nullptr, w->devices.size(), &(w->devices[0]),
+      nullptr, nullptr, &err_code);
+  OPENCL_CHECK_ERROR(err_code);
+  CHECK_EQ(w->queues.size(), 0U);
+  for (size_t i = 0; i < w->devices.size(); ++i) {
+    cl_device_id did = w->devices[i];
+    w->queues.push_back(
+        clCreateCommandQueue(w->context, did, 0, &err_code));
+    OPENCL_CHECK_ERROR(err_code);
+    LOG(INFO) << "opencl(" << i
+              << ")=\'" << cl::GetDeviceInfo(did, CL_DEVICE_NAME)
+              << "\' cl_device_id=" << did;
+  }
+  return true;
+}
+
+TVM_REGISTER_GLOBAL(_module_init_opencl)
+.set_body(InitOpenCL);
+
 }  // namespace cl
 }  // namespace runtime
 }  // namespace tvm
diff --git a/tests/python/integration/test_ewise.py b/tests/python/integration/test_ewise.py
index b87000f3f5d9889e5ed7f90a724865dfd376b5be..75798f0c4c2a3675091d63705f42cc100bf6612e 100644
--- a/tests/python/integration/test_ewise.py
+++ b/tests/python/integration/test_ewise.py
@@ -20,9 +20,9 @@ def test_add():
 
     # one line to build the function.
     def check_device(device, host="stackvm"):
-        if not tvm.codegen.target_enabled(host):
+        if not tvm.codegen.enabled(host):
             return
-        if not tvm.codegen.target_enabled(device):
+        if not tvm.codegen.enabled(device):
             return
         fadd = tvm.build(s, [A, B, C],
                          device, host,
@@ -37,7 +37,8 @@ def test_add():
         np.testing.assert_allclose(
             c.asnumpy(), a.asnumpy() + b.asnumpy())
 
-    tvm.init_opencl()
+    if tvm.module.enabled("opencl"):
+        tvm.module.init_opencl()
     check_device("cuda", "llvm")
     check_device("opencl")
 
diff --git a/tests/python/integration/test_gemm.py b/tests/python/integration/test_gemm.py
index 9beaa5b516c91958cb6efe1d6337089c86704f75..e557ac0ff807b00212f0fcd61e01a8e5858fe3da 100644
--- a/tests/python/integration/test_gemm.py
+++ b/tests/python/integration/test_gemm.py
@@ -54,9 +54,9 @@ def test_gemm():
 
     # one line to build the function.
     def check_device(device, host="stackvm"):
-        if not tvm.codegen.target_enabled(host):
+        if not tvm.codegen.enabled(host):
             return
-        if not tvm.codegen.target_enabled(device):
+        if not tvm.codegen.enabled(device):
             return
 
         f = tvm.build(s, [A, B, C], device, host,
@@ -76,8 +76,9 @@ def test_gemm():
         np.testing.assert_allclose(
             c.asnumpy(), np.dot(a_np, b_np.T), rtol=1e-5)
 
+    if tvm.module.enabled("opencl"):
+        tvm.module.init_opencl()
     check_device("cuda")
-    tvm.init_opencl()
     check_device("opencl")
 
 if __name__ == "__main__":
diff --git a/tests/python/integration/test_reduce.py b/tests/python/integration/test_reduce.py
index 499a4606b9d2ab4c992ddec77e0be01fc62f1c02..7f2950b4469ea056e7841a809209b618ad6a4b62 100644
--- a/tests/python/integration/test_reduce.py
+++ b/tests/python/integration/test_reduce.py
@@ -19,9 +19,9 @@ def test_sum():
 
     # one line to build the function.
     def check_device(device, host="stackvm"):
-        if not tvm.codegen.target_enabled(host):
+        if not tvm.codegen.enabled(host):
             return
-        if not tvm.codegen.target_enabled(device):
+        if not tvm.codegen.enabled(device):
             return
         ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
         fsum = tvm.build(s,
@@ -37,7 +37,9 @@ def test_sum():
         np.testing.assert_allclose(
             b.asnumpy(), np.sum(a.asnumpy(), axis=1), rtol=1e-4)
 
-    tvm.init_opencl()
+    if tvm.module.enabled("opencl"):
+        tvm.module.init_opencl()
+
     check_device("cuda")
     check_device("opencl")
 
diff --git a/tests/python/integration/test_scan.py b/tests/python/integration/test_scan.py
index d194c9a62d986f0a720604ca4875db4f3ec98c48..ca0e169d8e33f63989cdceda1f6fca24b4f89f5b 100644
--- a/tests/python/integration/test_scan.py
+++ b/tests/python/integration/test_scan.py
@@ -23,9 +23,9 @@ def test_scan():
 
     # one line to build the function.
     def check_device(device, host="stackvm"):
-        if not tvm.codegen.target_enabled(host):
+        if not tvm.codegen.enabled(host):
             return
-        if not tvm.codegen.target_enabled(device):
+        if not tvm.codegen.enabled(device):
             return
         fscan = tvm.build(s, [X, res],
                           device, host,
@@ -41,7 +41,9 @@ def test_scan():
         np.testing.assert_allclose(
             b.asnumpy(), np.cumsum(a_np, axis=0))
 
-    tvm.init_opencl()
+    if tvm.module.enabled("opencl"):
+        tvm.module.init_opencl()
+
     check_device("cuda")
     check_device("opencl")
 
diff --git a/tests/python/unittest/test_codegen_device.py b/tests/python/unittest/test_codegen_device.py
index 171cc082677dffbd74ac1c115468b501342d149d..42f60cdef979b32dee0128281e4540104f88918e 100644
--- a/tests/python/unittest/test_codegen_device.py
+++ b/tests/python/unittest/test_codegen_device.py
@@ -1,4 +1,5 @@
 import tvm
+from tvm.addon import testing
 import numpy as np
 
 def test_add_pipeline():
@@ -27,9 +28,9 @@ def test_add_pipeline():
     fsplits = tvm.ir_pass.SplitHostDevice(fapi)
 
     def check_target(device, host="stackvm"):
-        if not tvm.codegen.target_enabled(host):
+        if not tvm.codegen.enabled(host):
             return
-        if not tvm.codegen.target_enabled(device):
+        if not tvm.codegen.enabled(device):
             return
         ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
         mhost = tvm.codegen.build(fsplits[0], host)
@@ -47,8 +48,33 @@ def test_add_pipeline():
         np.testing.assert_allclose(
             c.asnumpy(), a.asnumpy() + b.asnumpy())
 
+    def check_module_save(device, host="stackvm"):
+        if not tvm.codegen.enabled(host):
+            return
+        if not tvm.codegen.enabled(device):
+            return
+        ctx = tvm.gpu(0) if device == "cuda" else tvm.cl(0)
+        fmt = "ptx" if device == "cuda" else "cl"
+        mhost = tvm.codegen.build(fsplits[0], host)
+        mdev = tvm.codegen.build(fsplits[1:], device)
+        temp = testing.tempdir()
+        mpath = temp.relpath("test.%s" % fmt)
+        mdev.save(mpath)
+        mdev2 = tvm.module.load(mpath)
+        mhost.import_module(mdev2)
+        f = mhost.entry_func
+        # launch the kernel.
+        n = 1027
+        a = tvm.nd.array(np.random.uniform(size=n).astype(Ab.dtype), ctx)
+        b = tvm.nd.array(np.random.uniform(size=n).astype(Bb.dtype), ctx)
+        c = tvm.nd.array(np.zeros(n, dtype=Cb.dtype), ctx)
+        f(a, b, c)
+        np.testing.assert_allclose(
+            c.asnumpy(), a.asnumpy() + b.asnumpy())
+
     check_target("cuda", host="stackvm")
     check_target("cuda", host="llvm")
+    check_module_save("cuda", host="stackvm")
 
 
 if __name__ == "__main__":
diff --git a/tests/python/unittest/test_codegen_stack_llvm.py b/tests/python/unittest/test_codegen_stack_llvm.py
index 019f5fbf4e8ed72a7bbc7416eaa8c950662d510d..caaa056baa01219fb8399e563d80321f0467f75b 100644
--- a/tests/python/unittest/test_codegen_stack_llvm.py
+++ b/tests/python/unittest/test_codegen_stack_llvm.py
@@ -8,7 +8,7 @@ def tvm_call_packed(*args):
 
 def run_jit(fapi, check):
     for target in ["llvm", "stackvm"]:
-        if not tvm.codegen.target_enabled(target):
+        if not tvm.codegen.enabled(target):
             continue
         f = tvm.codegen.build(fapi, target)
         s = f.get_source()
@@ -95,7 +95,7 @@ def test_llvm_add_pipeline():
     fapi = tvm.ir_pass.MakeAPI(stmt, "myadd", [Ab, Bb, Cb], 0)
 
     def check_llvm():
-        if not tvm.codegen.target_enabled("llvm"):
+        if not tvm.codegen.enabled("llvm"):
             return
         # build and invoke the kernel.
         f = tvm.codegen.build(fapi, "llvm")
diff --git a/tests/python/unittest/test_module_load.py b/tests/python/unittest/test_module_load.py
index 89e015c28da68eb5c9987498e0e831729a1c7d12..59b5391b1ee45710992653d007fcedf7337b83e5 100644
--- a/tests/python/unittest/test_module_load.py
+++ b/tests/python/unittest/test_module_load.py
@@ -1,14 +1,29 @@
 import tvm
-from tvm.addon import cc_compiler as cc
+from tvm.addon import cc_compiler as cc, testing
 import os
-import tempfile
 import numpy as np
+import subprocess
+
+runtime_py = """
+import os
+import sys
+os.environ["TVM_USE_RUNTIME_LIB"] = "1"
+import tvm
+import numpy as np
+path_dso = sys.argv[1]
+dtype = sys.argv[2]
+ff = tvm.module.load(path_dso)
+a = tvm.nd.array(np.zeros(10, dtype=dtype))
+ff(a)
+np.testing.assert_equal(a.asnumpy(), np.arange(a.shape[0]))
+print("Finish runtime checking...")
+"""
 
 def test_dso_module_load():
-    if not tvm.codegen.target_enabled("llvm"):
+    if not tvm.codegen.enabled("llvm"):
         return
     dtype = 'int64'
-    temp_dir = tempfile.mkdtemp()
+    temp = testing.tempdir()
 
     def save_object(names):
         n = tvm.Var('n')
@@ -25,10 +40,10 @@ def test_dso_module_load():
         for name in names:
             m.save(name)
 
-    path_obj = "%s/test.o" % temp_dir
-    path_ll = "%s/test.ll" % temp_dir
-    path_bc = "%s/test.bc" % temp_dir
-    path_dso = "%s/test.so" % temp_dir
+    path_obj = temp.relpath("test.o")
+    path_ll = temp.relpath("test.ll")
+    path_bc = temp.relpath("test.bc")
+    path_dso = temp.relpath("test.so")
     save_object([path_obj, path_ll, path_bc])
     cc.create_shared(path_dso, [path_obj])
 
@@ -41,14 +56,14 @@ def test_dso_module_load():
     a = tvm.nd.array(np.zeros(10, dtype=dtype))
     f2(a)
     np.testing.assert_equal(a.asnumpy(), np.arange(a.shape[0]))
-    files = [path_obj, path_ll, path_bc, path_dso]
-    for f in files:
-        os.remove(f)
-    os.rmdir(temp_dir)
 
+    path_runtime_py = temp.relpath("runtime.py")
+    with open(path_runtime_py, "w") as fo:
+        fo.write(runtime_py)
 
-def test_cuda_module_load():
-    pass
+    subprocess.check_call(
+        "python %s %s %s" % (path_runtime_py, path_dso, dtype),
+        shell=True)
 
 if __name__ == "__main__":
     test_dso_module_load()
diff --git a/tests/python/unittest/test_runtime_ndarray.py b/tests/python/unittest/test_runtime_ndarray.py
index 6731c8f2394a2c794341c670fbe40626462a5815..1c3e5f70d1788ec3b3056aaab4423f9bb1cb71cf 100644
--- a/tests/python/unittest/test_runtime_ndarray.py
+++ b/tests/python/unittest/test_runtime_ndarray.py
@@ -2,9 +2,11 @@ import tvm
 import numpy as np
 
 def enabled_ctx_list():
-    tvm.init_opencl()
-    ctx_list = [tvm.cpu(0), tvm.gpu(0), tvm.opencl(0)]
-    ctx_list = [ctx for ctx in ctx_list if ctx.enabled]
+    if tvm.module.enabled("opencl"):
+        tvm.module.init_opencl()
+
+    ctx_list = [('cpu', tvm.cpu(0)), ('gpu', tvm.gpu(0)), ('cl', tvm.opencl(0))]
+    ctx_list = [x[1] for x in ctx_list if tvm.module.enabled(x[0])]
     return ctx_list
 
 ENABLED_CTX_LIST = enabled_ctx_list()