[RUNTIME] better parallel launcher and task distribution (#1026)

5ff65749 · Yida Wang · Tianqi Chen · 0ec7cabe · 5ff65749 · 5ff65749
Commit 5ff65749, authored 7 years ago by Yida Wang; committed by Tianqi Chen 7 years ago.
--- a/src/runtime/thread_pool.cc
+++ b/src/runtime/thread_pool.cc
@@ -37,12 +37,11 @@ class ParallelLauncher {
            void* cdata,
            int num_task,
            bool need_sync) {
-    std::lock_guard<std::mutex> lock(mutex_);
-    num_pending_ = num_task;
+    num_pending_.store(num_task);
    this->cdata = cdata;
    this->flambda = flambda;
    this->env.num_task = num_task;
-    has_error_ = false;
+    has_error_.store(false);
    // reshape
    if (static_cast<size_t>(num_task) > par_errors_.size()) {
      par_errors_.resize(num_task + 1);
@@ -66,11 +65,10 @@ class ParallelLauncher {
  }
  // Wait n jobs to finish
  int WaitForJobs() {
-    std::unique_lock<std::mutex> lock(mutex_);
-    cv_.wait(lock, [this] {
-        return num_pending_ == 0;
-      });
-    if (!has_error_) return 0;
+    while (num_pending_.load() != 0) {
+      tvm::runtime::threading::Yield();
+    }
+    if (!has_error_.load()) return 0;
    std::string err("");
    for (size_t i = 0; i < par_errors_.size(); ++i) {
      if (par_errors_[i].length() != 0) {
@@ -83,23 +81,13 @@ class ParallelLauncher {
  }
  // Signal that one job has finished.
  void SignalJobError(int task_id) {
-    std::unique_lock<std::mutex> lock(mutex_);
-    --num_pending_;
    par_errors_[task_id] = TVMGetLastError();
-    has_error_ = true;
-    if (num_pending_ == 0) {
-      lock.unlock();
-      cv_.notify_one();
-    }
+    has_error_.store(true);
+    num_pending_.fetch_sub(1);  // must be last: publishes the error to WaitForJobs
  }
  // Signal that one job has finished.
  void SignalJobFinish() {
-    std::unique_lock<std::mutex> lock(mutex_);
-    --num_pending_;
-    if (num_pending_ == 0) {
-      lock.unlock();
-      cv_.notify_one();
-    }
+    num_pending_.fetch_sub(1);
  }
  // Get thread local version of the store.
  static ParallelLauncher* ThreadLocal() {
@@ -116,14 +104,10 @@ class ParallelLauncher {
  bool is_worker{false};

 private:
-  // The mutex to access local env.
-  std::mutex mutex_;
-  // The conditional variable.
-  std::condition_variable cv_;
  // The pending jobs.
-  uint32_t num_pending_;
+  std::atomic<int32_t> num_pending_;
  // Whether error has been countered.
-  bool has_error_;
+  std::atomic<bool> has_error_;
  // The counter page.
  std::atomic<int32_t>* sync_counter_{nullptr};
  // The error message
@@ -257,13 +241,13 @@ class ThreadPool {
 public:
  ThreadPool(): num_workers_(tvm::runtime::threading::MaxConcurrency()) {
    for (int i = 0; i < num_workers_; ++i) {
-      // The SpscTaskQueue only host ONE item at a time
+      // The SpscTaskQueue only hosts ONE item at a time
      queues_.emplace_back(std::unique_ptr<SpscTaskQueue>(new SpscTaskQueue()));
    }
    threads_ = std::unique_ptr<tvm::runtime::threading::ThreadGroup>(
        new tvm::runtime::threading::ThreadGroup(
          num_workers_, [this](int worker_id) { this->RunWorker(worker_id); },
-          false /* include_main_thread */));
+          exclude_worker0_ /* include_main_thread */));
  }
  ~ThreadPool() {
    for (std::unique_ptr<SpscTaskQueue>& q : queues_) {
@@ -289,10 +273,20 @@ class ThreadPool {
    launcher->Init(flambda, cdata, num_task, need_sync != 0);
    SpscTaskQueue::Task tsk;
    tsk.launcher = launcher;
-    for (int i = 0; i < num_task; ++i) {
+    // when the master thread runs task 0, queues_[0] is left unused
+    for (int i = exclude_worker0_; i < num_task; ++i) {
      tsk.task_id = i;
      queues_[i]->Push(tsk);
    }
+    // use the master thread to run task 0
+    if (exclude_worker0_) {
+      TVMParallelGroupEnv* penv = &(tsk.launcher->env);
+      if ((*tsk.launcher->flambda)(0, penv, cdata) == 0) {
+        tsk.launcher->SignalJobFinish();
+      } else {
+        tsk.launcher->SignalJobError(0);  // task 0 ran here, not tsk.task_id
+      }
+    }
    int res = launcher->WaitForJobs();
    return res;
  }
@@ -320,6 +314,8 @@ class ThreadPool {
    }
  }
  int num_workers_;
+  // whether to exclude worker 0 and run task 0 on the master thread
+  bool exclude_worker0_{true};
  std::vector<std::unique_ptr<SpscTaskQueue> > queues_;
  std::unique_ptr<tvm::runtime::threading::ThreadGroup> threads_;
 };

--- a/src/runtime/threading_backend.cc
+++ b/src/runtime/threading_backend.cc
@@ -29,7 +29,7 @@ class ThreadGroup::Impl {
    const char *val = getenv("TVM_BIND_THREADS");
    if (val == nullptr || atoi(val) == 1) {
      if (num_workers_ <= std::thread::hardware_concurrency()) {
-        SetAffinity();
+        SetAffinity(exclude_worker0);
      } else {
        LOG(WARNING)
          << "The thread affinity cannot be set when the number of workers"
@@ -47,7 +47,9 @@ class ThreadGroup::Impl {

 private:
  // bind worker threads to disjoint cores
-  void SetAffinity() {
+  // if worker 0 is offloaded to master, i.e. exclude_worker0 is true,
+  // the master thread is bound to core 0.
+  void SetAffinity(bool exclude_worker0) {
 #if defined(__ANDROID__)
 #ifndef CPU_SET
 #define CPU_SETSIZE 1024
@@ -62,19 +64,27 @@ class ThreadGroup::Impl {
    memset((cpusetp), 0, sizeof(cpu_set_t))
 #endif
 #endif
-    for (unsigned i=0; i < threads_.size(); ++i) {
 #if defined(__linux__) || defined(__ANDROID__)
+    for (unsigned i = 0; i < threads_.size(); ++i) {
+      unsigned core_id = i + exclude_worker0;
      cpu_set_t cpuset;
      CPU_ZERO(&cpuset);
-      CPU_SET(i, &cpuset);
+      CPU_SET(core_id, &cpuset);
 #if defined(__ANDROID__)
      sched_setaffinity(threads_[i].native_handle(), sizeof(cpu_set_t), &cpuset);
 #else
      pthread_setaffinity_np(threads_[i].native_handle(),
          sizeof(cpu_set_t), &cpuset);
-#endif
 #endif
    }
+    if (exclude_worker0) {  // bind the master thread to core 0
+      cpu_set_t cpuset;
+      CPU_ZERO(&cpuset);
+      CPU_SET(0, &cpuset);
+      pthread_setaffinity_np(pthread_self(),
+        sizeof(cpu_set_t), &cpuset);
+    }
+#endif
  }

  int num_workers_;