diff --git a/python/tvm/contrib/cc_compiler.py b/python/tvm/contrib/cc_compiler.py
index af599ed54ff1eda9b9ccfc9ac44abb5809783144..da6bd661444a9b93cc3efbb86a3ee8877da64a88 100644
--- a/python/tvm/contrib/cc_compiler.py
+++ b/python/tvm/contrib/cc_compiler.py
@@ -4,13 +4,15 @@ from __future__ import absolute_import as _abs
 import sys
 import subprocess
 
-def create_shared(path_target, objects,
-                  options=None, cc="g++"):
+def create_shared(output,
+                  objects,
+                  options=None,
+                  cc="g++"):
     """Create shared library.
 
     Parameters
     ----------
-    path_target : str
+    output : str
         The target shared library.
 
     objects : list
@@ -19,19 +21,25 @@ def create_shared(path_target, objects,
     options : str
         The additional options.
 
-    cc : str
+    cc : str, optional
         The compile string.
     """
     cmd = [cc]
     cmd += ["-shared"]
+
     if sys.platform == "darwin":
         cmd += ["-undefined", "dynamic_lookup"]
-    cmd += ["-o", path_target]
-    cmd += objects
+    cmd += ["-o", output]
+
+    if isinstance(objects, str):
+        cmd += [objects]
+    else:
+        cmd += objects
+
     if options:
         cmd += options
-    args = ' '.join(cmd)
 
+    args = ' '.join(cmd)
     proc = subprocess.Popen(
         args, shell=True,
         stdout=subprocess.PIPE,
@@ -39,6 +47,6 @@ def create_shared(path_target, objects,
     (out, _) = proc.communicate()
 
     if proc.returncode != 0:
-        sys.stderr.write("Compilation error:\n")
-        sys.stderr.write(out)
-        sys.stderr.flush()
+        msg = "Compilation error:\n"
+        msg += out.decode("utf-8")
+        raise RuntimeError(msg)
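
For reference, a minimal usage sketch of the revised create_shared signature, assuming g++ is on the host PATH; the file names and flags below are placeholders, not part of this change.

    # Link object files into a shared library with the new API.
    from tvm.contrib import cc_compiler

    # `objects` may now be a single object file path ...
    cc_compiler.create_shared("myadd.so", "myadd.o")

    # ... or a list of object files; options is passed as a list of flags,
    # matching the `cmd += options` concatenation above.
    cc_compiler.create_shared("libops.so", ["add.o", "mul.o"],
                              options=["-O2"], cc="g++")
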
diff --git a/python/tvm/contrib/rpc.py b/python/tvm/contrib/rpc.py
index 1207f2a8d5c3522f5404694fd6840e21c97f8050..627b868a656d6208aebef4042fe646099067374f 100644
--- a/python/tvm/contrib/rpc.py
+++ b/python/tvm/contrib/rpc.py
@@ -15,7 +15,7 @@ import socket
 import struct
 import logging
 import multiprocessing
-from . import util
+from . import util, cc_compiler
 from ..module import load as _load_module
 from .._ffi.function import _init_api, register_func
 from .._ffi.ndarray import context as _context
@@ -34,19 +34,28 @@ def _serve_loop(sock, addr):
         path = temp.relpath(file_name)
         with open(path, "wb") as out_file:
             out_file.write(blob)
+        logging.info("upload %s", path)
 
     @register_func("tvm.contrib.rpc.server.download")
     def download(file_name):
         """Download file from remote"""
         path = temp.relpath(file_name)
         dat = bytearray(open(path, "rb").read())
+        logging.info("download %s", path)
         return dat
 
     @register_func("tvm.contrib.rpc.server.load_module")
     def load_module(file_name):
         """Load module from remote side."""
         path = temp.relpath(file_name)
+        # Try to create a shared library on the remote side
+        if path.endswith('.o'):
+            logging.info("Creating shared library from %s", path)
+            cc_compiler.create_shared(path + '.so', path)
+            path += '.so'
+
         m = _load_module(path)
+        logging.info("load_module %s", path)
         return m
 
     _ServerLoop(sockfd)
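
A short client-side sketch of the flow this server-side change enables, mirroring the new unit test below; the host address, port, and file name are hypothetical and assume an RPC server is already running on the device.

    from tvm.contrib import rpc

    # Connect to a running RPC server (address/port are placeholders).
    remote = rpc.connect("192.168.0.10", 9090)
    # Upload a cross-compiled object file into the server's temp directory.
    remote.upload("myadd.o")
    # load_module now detects the ".o" suffix and links it into "myadd.o.so"
    # on the remote before loading it.
    fremote = remote.load_module("myadd.o")
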
diff --git a/src/codegen/codegen.cc b/src/codegen/codegen.cc
index c83d3bd0f92711efc30f20cb4ddf0ccd7bd549fe..3167165068638dcde7b44f400b37977c6cf9015c 100644
--- a/src/codegen/codegen.cc
+++ b/src/codegen/codegen.cc
@@ -17,7 +17,7 @@ namespace codegen {
 runtime::Module Build(const Array<LoweredFunc>& funcs,
                       const std::string& target) {
   std::string mode = target;
-  size_t pos = mode.find("-");
+  size_t pos = mode.find(' ');
   if (pos != std::string::npos) {
     mode = mode.substr(0, pos);
   }
diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc
index 0515b6bd13de188aa9b210d3a7e3bb7a7df5c78f..28783174d1617b9d49a27e030502dba87b903e1c 100644
--- a/src/codegen/llvm/codegen_llvm.cc
+++ b/src/codegen/llvm/codegen_llvm.cc
@@ -14,7 +14,7 @@ namespace tvm {
 namespace codegen {
 
 void CodeGenLLVM::Init(const std::string& module_name,
-                       const std::string& target_triple,
+                       llvm::TargetMachine* tm,
                        llvm::LLVMContext* ctx) {
   InitializeLLVM();
   static_assert(sizeof(TVMValue) == sizeof(double), "invariant");
@@ -81,17 +81,14 @@ void CodeGenLLVM::Init(const std::string& module_name,
           t_int64_, t_int64_, t_f_tvm_par_for_lambda_->getPointerTo(), t_void_p_}
         , false),
       llvm::Function::ExternalLinkage, "TVMBackendParallelFor", module_.get());
-  this->InitTarget(target_triple);
+  this->InitTarget(tm);
   // initialize builder
   builder_.reset(new IRBuilder(*ctx));
   this->InitGlobalContext();
 }
 
-void CodeGenLLVM::InitTarget(const std::string& target) {
-  llvm::TargetMachine* tm;
-  std::string target_triple;
-  std::tie(tm, target_triple) = GetLLVMTarget(target);
-  module_->setTargetTriple(target_triple);
+void CodeGenLLVM::InitTarget(llvm::TargetMachine* tm) {
+  module_->setTargetTriple(tm->getTargetTriple().str());
   module_->setDataLayout(tm->createDataLayout());
   data_layout_.reset(new llvm::DataLayout(module_.get()));
 }
diff --git a/src/codegen/llvm/codegen_llvm.h b/src/codegen/llvm/codegen_llvm.h
index b361eabfebd2ce1cc6171d51c25a56f524695fe4..ac1241c3b73f73d7c1bbdc19246242e2d06d6d13 100644
--- a/src/codegen/llvm/codegen_llvm.h
+++ b/src/codegen/llvm/codegen_llvm.h
@@ -31,11 +31,11 @@ class CodeGenLLVM :
   /*!
    * \brief Initialize the code generator with given context
    * \param module_name The name of the module.
-   * \param target_triple The target triple, can be empty.
+   * \param tm Target machine model
    * \param ctx The context.
    */
   void Init(const std::string& module_name,
-            const std::string& target_triple,
+            llvm::TargetMachine* tm,
             llvm::LLVMContext* ctx);
   /*!
    * \brief Compile and add function f to the current module.
@@ -208,7 +208,7 @@ class CodeGenLLVM :
   // return the end block after the check
   llvm::BasicBlock* CheckCallSuccess(llvm::Value* retcode);
   // Initialize target
-  void InitTarget(const std::string& target);
+  void InitTarget(llvm::TargetMachine* tm);
   // Add a function to set global module context
   void InitGlobalContext();
   // add alias information.
diff --git a/src/codegen/llvm/llvm_common.cc b/src/codegen/llvm/llvm_common.cc
index 5d504fdcb65562f5b736d7c476b905709512801d..0d08622b6659138b7f465ed1fe14f9f81c7d08b8 100644
--- a/src/codegen/llvm/llvm_common.cc
+++ b/src/codegen/llvm/llvm_common.cc
@@ -36,32 +36,55 @@ void InitializeLLVM() {
   }
 }
 
-std::pair<llvm::TargetMachine*, std::string>
-GetLLVMTarget(const std::string& target_str) {
+llvm::TargetMachine*
+GetLLVMTargetMachine(const std::string& target_str) {
   // setup target triple
-  std::string target_triple;
-  CHECK_EQ(target_str.substr(0, 4), "llvm");
-  if (target_str.length() > 4) {
-    target_triple = target_str.substr(5, target_str.length() - 5);
-  } else {
-    target_triple = "";
+  CHECK(target_str.length() >= 4 &&
+        target_str.substr(0, 4) == "llvm")
+      << "llvm target must starts with llvm";
+  // simple parser
+  std::string target_triple = "";
+  std::string cpu = "generic";
+  std::string features = "";
+  std::string key, value;
+  if (target_str.length() > 5) {
+    std::istringstream is(target_str.substr(5, target_str.length() - 5));
+    while (is >> key) {
+      size_t pos = key.find('=');
+      if (pos != std::string::npos) {
+        CHECK_GT(key.length(), pos + 1)
+            << "invalid argument " << key;
+        value = key.substr(pos + 1);
+        key = key.substr(0, pos);
+      } else {
+        CHECK(is >> value)
+            << "Unspecified value for option " << key;
+      }
+      if (key == "-target" ||
+          key == "-mtriple") {
+        target_triple = value;
+      } else if (key == "-mcpu") {
+        cpu = value;
+      } else if (key == "-features") {
+        features = value;
+      } else {
+        LOG(FATAL) << "unknown option " << key;
+      }
+    }
   }
   if (target_triple.length() == 0 ||
       target_triple == "default") {
     target_triple = llvm::sys::getDefaultTargetTriple();
   }
-
   std::string err;
   const llvm::Target* target =
       llvm::TargetRegistry::lookupTarget(target_triple, err);
   CHECK(target) << err << " target_triple=" << target_triple;
-  std::string cpu = "generic";
-  std::string features = "";
   llvm::TargetOptions opt;
   auto rmodel = llvm::Reloc::PIC_;
   llvm::TargetMachine* tm =
       target->createTargetMachine(target_triple, cpu, features, opt, rmodel);
-  return {tm, target_triple};
+  return tm;
 }
 
 }  // namespace codegen
diff --git a/src/codegen/llvm/llvm_common.h b/src/codegen/llvm/llvm_common.h
index 9b5ef96fa11f59a832bfe834abeed4acf2cb2252..353f8af39a60fb1088f0d25d74655b9c1d3823e1 100644
--- a/src/codegen/llvm/llvm_common.h
+++ b/src/codegen/llvm/llvm_common.h
@@ -51,11 +51,11 @@ void InitializeLLVM();
 
 /*!
  * \brief Get target machine from target_str string.
- * \param target_str Target triple string, can have llvm- prefix, can be empty.
- * \return Pair of target machine and target triple.
+ * \param target_str Target string, in the format "llvm -target=xxx -mcpu=xxx".
+ * \return The target machine.
  */
-std::pair<llvm::TargetMachine*, std::string>
-GetLLVMTarget(const std::string& target_str);
+llvm::TargetMachine*
+GetLLVMTargetMachine(const std::string& target_str);
 
 }  // namespace codegen
 }  // namespace tvm
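
The new target string format is easiest to exercise from the Python side; a brief sketch follows, assuming an LLVM-enabled build (see also the new unit test below). Recognized keys are -target/-mtriple, -mcpu and -features; the ARM triple and cpu values here are illustrative only.

    import tvm

    n = tvm.convert(1024)
    A = tvm.placeholder((n,), name="A")
    B = tvm.placeholder((n,), name="B")
    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name="C")
    s = tvm.create_schedule(C.op)

    # Bare "llvm" falls back to the default (host) target triple.
    f_native = tvm.build(s, [A, B, C], "llvm")

    # Cross compilation: pass a triple, optionally with cpu/features.
    f_i386 = tvm.build(s, [A, B, C], "llvm -target=i386-pc-linux-gnu")
    f_arm = tvm.build(s, [A, B, C],
                      "llvm -target=arm-none-linux-gnueabihf -mcpu=cortex-a9")
    f_arm.save("myadd_arm.o")  # object file to link/load on the device
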
diff --git a/src/codegen/llvm/llvm_module.cc b/src/codegen/llvm/llvm_module.cc
index 9d18ff3198ac8fd08c2d262c2c3226306b075a55..24ca48283ae5ce34a04f267ad27a93381b178760 100644
--- a/src/codegen/llvm/llvm_module.cc
+++ b/src/codegen/llvm/llvm_module.cc
@@ -98,11 +98,12 @@ class LLVMModuleNode final : public runtime::ModuleNode {
 
   void Init(const Array<LoweredFunc>& funcs, std::string target) {
     InitializeLLVM();
-    std::tie(tm_, target_triple_) = GetLLVMTarget(target);
+    tm_ = GetLLVMTargetMachine(target);
+    target_ = target;
     CHECK_NE(funcs.size(), 0U);
     ctx_ = std::make_shared<llvm::LLVMContext>();
     CodeGenLLVM cg;
-    cg.Init(funcs[0]->name, target, ctx_.get());
+    cg.Init(funcs[0]->name, tm_, ctx_.get());
     for (LoweredFunc f :  funcs) {
       cg.AddFunction(f);
     }
@@ -115,11 +116,16 @@ class LLVMModuleNode final : public runtime::ModuleNode {
   void LazyInitJIT() {
     CHECK(ee_ == nullptr);
     std::lock_guard<std::mutex> lock(mutex_);
-    std::string target_triple = mptr_->getTargetTriple();
     llvm::EngineBuilder builder(std::move(module_));
     builder.setEngineKind(llvm::EngineKind::JIT);
     builder.setOptLevel(llvm::CodeGenOpt::Aggressive);
     llvm::TargetMachine *tm = builder.selectTarget();
+    llvm::TargetMachine *tm_sys = GetLLVMTargetMachine("llvm");
+    if (tm_sys->getTargetTriple().getArch() != tm->getTargetTriple().getArch()) {
+      LOG(FATAL) << "Cannot run module, architecture mismatch "
+                 << " module=" << tm->getTargetTriple().str()
+                 << " system=" << tm_sys->getTargetTriple().str();
+    }
     llvm::DataLayout layout(tm->createDataLayout());
     CHECK(layout == mptr_->getDataLayout())
         << "Data layout mismatch between module("
@@ -127,8 +133,9 @@ class LLVMModuleNode final : public runtime::ModuleNode {
         << " and ExecutionEngine ("
         << layout.getStringRepresentation() << ")";
     ee_ = builder.create(tm);
+
     CHECK(ee_ != nullptr)
-        << "Failed to initialize git engine for " << target_triple;
+        << "Failed to initialize git engine for " << mptr_->getTargetTriple();
     ee_->runStaticConstructorsDestructors(false);
     // setup context address.
     void** ctx_addr =
@@ -139,7 +146,7 @@ class LLVMModuleNode final : public runtime::ModuleNode {
     }
   }
   // The target configuration string
-  std::string target_triple_;
+  std::string target_;
   // JIT lock
   std::mutex mutex_;
   // execution engine
diff --git a/tests/python/unittest/test_codegen_cross_llvm.py b/tests/python/unittest/test_codegen_cross_llvm.py
new file mode 100644
index 0000000000000000000000000000000000000000..a69e215476adf8a24c0dcd55ebccc1c177cfc2eb
--- /dev/null
+++ b/tests/python/unittest/test_codegen_cross_llvm.py
@@ -0,0 +1,69 @@
+"""Test cross compilation"""
+import tvm
+import os
+import struct
+from tvm.contrib import util, cc_compiler as cc, rpc
+import numpy as np
+
+def test_llvm_add_pipeline():
+    nn = 1024
+    n = tvm.convert(nn)
+    A = tvm.placeholder((n,), name='A')
+    B = tvm.placeholder((n,), name='B')
+    C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
+    s = tvm.create_schedule(C.op)
+    xo, xi = s[C].split(C.op.axis[0], factor=4)
+    s[C].parallel(xo)
+    s[C].vectorize(xi)
+
+    def verify_elf(path, e_machine):
+        with open(path, "rb") as fi:
+            arr = fi.read(20)
+            assert struct.unpack('ccc', arr[1:4]) == (b'E',b'L',b'F')
+            endian = struct.unpack('b', arr[0x5:0x6])[0]
+            endian = '<' if endian == 1 else '>'
+            assert struct.unpack(endian + 'h', arr[0x12:0x14])[0] == e_machine
+
+    def build_i386():
+        temp = util.tempdir()
+        target = "llvm -target=i386-pc-linux-gnu"
+        f = tvm.build(s, [A, B, C], target)
+        path = temp.relpath("myadd.o")
+        f.save(path)
+        verify_elf(path, 0x03)
+
+    def build_arm():
+        temp = util.tempdir()
+        target = "llvm -target=arm-none-linux-gnueabihf"
+        f = tvm.build(s, [A, B, C], target)
+        path = temp.relpath("myadd.o")
+        f.save(path)
+        verify_elf(path, 0x28)
+        # Do an RPC verification: launch the kernel on an ARM board if available.
+        host = os.environ.get('TVM_RPC_ARM_HOST', None)
+        remote = None
+        if host:
+            port = int(os.environ['TVM_RPC_ARM_PORT'])
+            try:
+                remote = rpc.connect(host, port)
+            except tvm.TVMError:
+                pass
+
+        if remote:
+            remote.upload(path)
+            farm = remote.load_module("myadd.o")
+            ctx = remote.cpu(0)
+            n = nn
+            a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
+            b = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
+            c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
+            farm(a, b, c)
+            np.testing.assert_allclose(
+                c.asnumpy(), a.asnumpy() + b.asnumpy())
+            print("Verification finish on remote..")
+
+    build_i386()
+    build_arm()
+
+if __name__ == "__main__":
+    test_llvm_add_pipeline()