
Merge pull request #1238 from trapexit/boundedqueue

Change threadpool to use concurrentqueue
trapexit committed via GitHub, 1 year ago
commit 69de4de53f
20 changed files:

   1  DEPENDENCIES
 166  libfuse/lib/bounded_queue.hpp
 191  libfuse/lib/bounded_thread_pool.hpp
   9  libfuse/lib/fuse_loop.cpp
 582  libfuse/lib/moodycamel/blockingconcurrentqueue.h
3747  libfuse/lib/moodycamel/concurrentqueue.h
 427  libfuse/lib/moodycamel/lightweightsemaphore.h
  11  libfuse/lib/syslog.c
   1  libfuse/lib/syslog.h
   4  src/fuse_readdir_cor.cpp
   2  src/fuse_readdir_cor.hpp
   4  src/fuse_readdir_cosr.cpp
   2  src/fuse_readdir_cosr.hpp
 582  src/moodycamel/blockingconcurrentqueue.h
3747  src/moodycamel/concurrentqueue.h
 427  src/moodycamel/lightweightsemaphore.h
  11  src/syslog.cpp
   1  src/syslog.hpp
 161  src/unbounded_queue.hpp
 188  src/unbounded_thread_pool.hpp

1
DEPENDENCIES

@@ -7,3 +7,4 @@
* ghc::filesystem: https://github.com/gulrak/filesystem
* nonstd::optional: https://github.com/martinmoene/optional-lite
* fmt: https://github.com/fmtlib/fmt
* concurrentqueue: https://github.com/cameron314/concurrentqueue
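
As a rough illustration of the newly vendored dependency (not part of this diff), a minimal producer/consumer pair built on moodycamel::BlockingConcurrentQueue might look like the following sketch; the include path assumes the vendored location added by this commit:

// Minimal sketch, assuming only the vendored moodycamel headers.
#include "moodycamel/blockingconcurrentqueue.h"
#include <cstdio>
#include <thread>

int main()
{
  moodycamel::BlockingConcurrentQueue<int> q;

  std::thread producer([&q] {
    for(int i = 0; i < 10; i++)
      q.enqueue(i);        // allocates if needed; fails only on allocation failure
  });

  std::thread consumer([&q] {
    int v;
    for(int i = 0; i < 10; i++)
    {
      q.wait_dequeue(v);   // blocks until an item is available
      std::printf("%d\n", v);
    }
  });

  producer.join();
  consumer.join();
  return 0;
}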

166
libfuse/lib/bounded_queue.hpp

@@ -1,166 +0,0 @@
#pragma once
#include <condition_variable>
#include <mutex>
#include <queue>
#include <utility>
template<typename T>
class BoundedQueue
{
public:
explicit
BoundedQueue(std::size_t max_size_,
bool block_ = true)
: _block(block_),
_max_size(max_size_ ? max_size_ : 1)
{
}
BoundedQueue(const BoundedQueue&) = delete;
BoundedQueue(BoundedQueue&&) = default;
bool
push(const T& item_)
{
{
std::unique_lock<std::mutex> guard(_queue_lock);
_condition_push.wait(guard, [&]() { return _queue.size() < _max_size || !_block; });
if(_queue.size() == _max_size)
return false;
_queue.push(item_);
}
_condition_pop.notify_one();
return true;
}
bool
push(T&& item_)
{
{
std::unique_lock<std::mutex> guard(_queue_lock);
_condition_push.wait(guard, [&]() { return _queue.size() < _max_size || !_block; });
if(_queue.size() == _max_size)
return false;
_queue.push(std::move(item_));
}
_condition_pop.notify_one();
return true;
}
template<typename... Args>
bool
emplace(Args&&... args_)
{
{
std::unique_lock<std::mutex> guard(_queue_lock);
_condition_push.wait(guard, [&]() { return _queue.size() < _max_size || !_block; });
if(_queue.size() == _max_size)
return false;
_queue.emplace(std::forward<Args>(args_)...);
}
_condition_pop.notify_one();
return true;
}
bool
pop(T& item_)
{
{
std::unique_lock<std::mutex> guard(_queue_lock);
_condition_pop.wait(guard, [&]() { return !_queue.empty() || !_block; });
if(_queue.empty())
return false;
item_ = std::move(_queue.front());
_queue.pop();
}
_condition_push.notify_one();
return true;
}
std::size_t
size() const
{
std::lock_guard<std::mutex> guard(_queue_lock);
return _queue.size();
}
std::size_t
capacity() const
{
return _max_size;
}
bool
empty() const
{
std::lock_guard<std::mutex> guard(_queue_lock);
return _queue.empty();
}
bool
full() const
{
std::lock_guard<std::mutex> lock(_queue_lock);
return (_queue.size() == capacity());
}
void
block()
{
std::lock_guard<std::mutex> guard(_queue_lock);
_block = true;
}
void
unblock()
{
{
std::lock_guard<std::mutex> guard(_queue_lock);
_block = false;
}
_condition_push.notify_all();
_condition_pop.notify_all();
}
bool
blocking() const
{
std::lock_guard<std::mutex> guard(_queue_lock);
return _block;
}
private:
mutable std::mutex _queue_lock;
private:
bool _block;
std::queue<T> _queue;
const std::size_t _max_size;
std::condition_variable _condition_push;
std::condition_variable _condition_pop;
};
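
For context on what is being replaced: BoundedQueue is a mutex-and-condition-variable queue that blocks producers when full and consumers when empty, with unblock() as the shutdown escape hatch. A short illustrative sketch (not from the diff):

// Semantics of the removed BoundedQueue; illustrative only.
BoundedQueue<int> q(4);   // capacity 4, blocking by default

q.push(1);                // would block while full; returns true on success
int v;
q.pop(v);                 // would block while empty; v == 1

q.unblock();              // wake all waiters; from here on, push() on a full
                          // queue and pop() on an empty queue return false
                          // instead of blocking, letting workers drain and exit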

191
libfuse/lib/bounded_thread_pool.hpp

@@ -1,57 +1,44 @@
#pragma once
#include "bounded_queue.hpp"
#include "make_unique.hpp"
#include "moodycamel/blockingconcurrentqueue.h"
#include "syslog.h"
#include <atomic>
#include <csignal>
#include <functional>
#include <cstring>
#include <future>
#include <memory>
#include <stdexcept>
#include <string>
#include <thread>
#include <tuple>
#include <type_traits>
#include <utility>
#include <vector>
struct BoundedThreadPoolTraits : public moodycamel::ConcurrentQueueDefaultTraits
{
static const int MAX_SEMA_SPINS = 1;
};
class BoundedThreadPool
{
private:
using Proc = std::function<void(void)>;
using Queue = BoundedQueue<Proc>;
using Queues = std::vector<std::unique_ptr<Queue>>;
using Func = std::function<void(void)>;
using Queue = moodycamel::BlockingConcurrentQueue<Func,BoundedThreadPoolTraits>;
public:
explicit
BoundedThreadPool(std::size_t const thread_count_ = std::thread::hardware_concurrency(),
std::size_t const queue_depth_ = 1,
std::string const name_ = {})
: _queues(),
_count(thread_count_)
: _queue(queue_depth_,thread_count_,thread_count_),
_done(false),
_name(name_)
{
for(std::size_t i = 0; i < thread_count_; i++)
_queues.emplace_back(std::make_unique<Queue>(queue_depth_));
auto worker = [this](std::size_t i)
{
while(true)
{
Proc f;
for(std::size_t n = 0; n < (_count * K); ++n)
{
if(_queues[(i + n) % _count]->pop(f))
break;
}
if(!f && !_queues[i]->pop(f))
break;
f();
}
};
syslog_debug("threadpool: spawning %zu threads of queue depth %zu named '%s'",
thread_count_,
queue_depth_,
_name.c_str());
sigset_t oldset;
sigset_t newset;
@@ -61,50 +48,109 @@ public:
_threads.reserve(thread_count_);
for(std::size_t i = 0; i < thread_count_; ++i)
_threads.emplace_back(worker, i);
if(!name_.empty())
{
for(auto &t : _threads)
pthread_setname_np(t.native_handle(),name_.c_str());
int rv;
pthread_t t;
rv = pthread_create(&t,NULL,BoundedThreadPool::start_routine,this);
if(rv != 0)
{
syslog_warning("threadpool: error spawning thread - %d (%s)",
rv,
strerror(rv));
continue;
}
if(!_name.empty())
pthread_setname_np(t,_name.c_str());
_threads.push_back(t);
}
pthread_sigmask(SIG_SETMASK,&oldset,NULL);
if(_threads.empty())
throw std::runtime_error("threadpool: failed to spawn any threads");
}
~BoundedThreadPool()
{
for(auto &queue : _queues)
queue->unblock();
for(auto &thread : _threads)
pthread_cancel(thread.native_handle());
for(auto &thread : _threads)
thread.join();
syslog_debug("threadpool: destroying %zu threads named '%s'",
_threads.size(),
_name.c_str());
_done.store(true,std::memory_order_relaxed);
for(auto t : _threads)
pthread_cancel(t);
Func f;
while(_queue.try_dequeue(f))
continue;
for(auto t : _threads)
pthread_join(t,NULL);
}
template<typename F>
void
enqueue_work(F&& f_)
private:
static
void*
start_routine(void *arg_)
{
auto i = _index.fetch_add(1,std::memory_order_relaxed);
BoundedThreadPool *btp = static_cast<BoundedThreadPool*>(arg_);
BoundedThreadPool::Func func;
std::atomic<bool> &done = btp->_done;
BoundedThreadPool::Queue &q = btp->_queue;
moodycamel::ConsumerToken ctok(btp->_queue);
while(!done.load(std::memory_order_relaxed))
{
q.wait_dequeue(ctok,func);
func();
}
return NULL;
}
for(std::size_t n = 0; n < (_count * K); ++n)
public:
template<typename FuncType>
void
enqueue_work(moodycamel::ProducerToken &ptok_,
FuncType &&f_)
{
timespec ts = {0,10};
while(true)
{
if(_queues[(i + n) % _count]->push(f_))
if(_queue.try_enqueue(ptok_,f_))
return;
::nanosleep(&ts,NULL);
ts.tv_nsec += 10;
}
}
_queues[i % _count]->push(std::move(f_));
template<typename FuncType>
void
enqueue_work(FuncType &&f_)
{
timespec ts = {0,10};
while(true)
{
if(_queue.try_enqueue(f_))
return;
::nanosleep(&ts,NULL);
ts.tv_nsec += 10;
}
}
template<typename F>
template<typename FuncType>
[[nodiscard]]
std::future<typename std::result_of<F()>::type>
enqueue_task(F&& f_)
std::future<typename std::result_of<FuncType()>::type>
enqueue_task(FuncType&& f_)
{
using TaskReturnType = typename std::result_of<F()>::type;
using TaskReturnType = typename std::result_of<FuncType()>::type;
using Promise = std::promise<TaskReturnType>;
auto i = _index.fetch_add(1,std::memory_order_relaxed);
auto promise = std::make_shared<Promise>();
auto future = promise->get_future();
auto work = [=]()
@@ -113,38 +159,37 @@ public:
promise->set_value(rv);
};
for(std::size_t n = 0; n < (_count * K); ++n)
timespec ts = {0,10};
while(true)
{
if(_queues[(i + n) % _count]->push(work))
return future;
if(_queue.try_enqueue(work))
break;
::nanosleep(&ts,NULL);
ts.tv_nsec += 10;
}
_queues[i % _count]->push(std::move(work));
return future;
}
public:
std::vector<pthread_t>
threads()
threads() const
{
std::vector<pthread_t> rv;
for(auto &thread : _threads)
rv.push_back(thread.native_handle());
return rv;
return _threads;
}
private:
Queues _queues;
public:
Queue&
queue()
{
return _queue;
}
private:
std::vector<std::thread> _threads;
Queue _queue;
private:
const std::size_t _count;
std::atomic_uint _index;
static const unsigned int K = 2;
std::atomic<bool> _done;
std::string const _name;
std::vector<pthread_t> _threads;
};
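
Taken together, the reworked pool exposes three entry points: fire-and-forget enqueue_work(), with or without a producer token, and future-returning enqueue_task(). A hedged usage sketch, with setup values chosen only for illustration:

// Usage sketch of the reworked BoundedThreadPool; values are illustrative.
BoundedThreadPool tp(4 /* threads */, 128 /* queue depth */, "worker");

// Fire-and-forget; retries with a short nanosleep until the bounded
// queue accepts the item.
tp.enqueue_work([]{ /* work */ });

// Long-lived producers can amortize enqueue overhead with a token.
moodycamel::ProducerToken ptok(tp.queue());
tp.enqueue_work(ptok, []{ /* work */ });

// Tasks hand their result back through a std::future.
auto fut = tp.enqueue_task([]{ return 42; });
int rv = fut.get();   // rv == 42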

9
libfuse/lib/fuse_loop.cpp

@@ -5,6 +5,7 @@
#include "bounded_thread_pool.hpp"
#include "cpu.hpp"
#include "fmt/core.h"
#include "make_unique.hpp"
#include "scope_guard.hpp"
#include "syslog.h"
@@ -87,6 +88,7 @@ struct AsyncWorker
DEFER{ fuse_session_exit(_se); };
DEFER{ sem_post(_finished); };
moodycamel::ProducerToken ptok(_process_tp->queue());
while(!fuse_session_exited(_se))
{
int rv;
@@ -107,10 +109,13 @@
return handle_receive_error(rv,msgbuf);
} while(false);
_process_tp->enqueue_work([=] {
auto const func = [=]
{
_se->process_buf(_se,msgbuf);
msgbuf_free(msgbuf);
});
};
_process_tp->enqueue_work(ptok,func);
}
}
};

582
libfuse/lib/moodycamel/blockingconcurrentqueue.h

@@ -0,0 +1,582 @@
// Provides an efficient blocking version of moodycamel::ConcurrentQueue.
// ©2015-2020 Cameron Desrochers. Distributed under the terms of the simplified
// BSD license, available at the top of concurrentqueue.h.
// Also dual-licensed under the Boost Software License (see LICENSE.md)
// Uses Jeff Preshing's semaphore implementation (under the terms of its
// separate zlib license, see lightweightsemaphore.h).
#pragma once
#include "concurrentqueue.h"
#include "lightweightsemaphore.h"
#include <type_traits>
#include <cerrno>
#include <memory>
#include <chrono>
#include <ctime>
namespace moodycamel
{
// This is a blocking version of the queue. It has an almost identical interface to
// the normal non-blocking version, with the addition of various wait_dequeue() methods
// and the removal of producer-specific dequeue methods.
template<typename T, typename Traits = ConcurrentQueueDefaultTraits>
class BlockingConcurrentQueue
{
private:
typedef ::moodycamel::ConcurrentQueue<T, Traits> ConcurrentQueue;
typedef ::moodycamel::LightweightSemaphore LightweightSemaphore;
public:
typedef typename ConcurrentQueue::producer_token_t producer_token_t;
typedef typename ConcurrentQueue::consumer_token_t consumer_token_t;
typedef typename ConcurrentQueue::index_t index_t;
typedef typename ConcurrentQueue::size_t size_t;
typedef typename std::make_signed<size_t>::type ssize_t;
static const size_t BLOCK_SIZE = ConcurrentQueue::BLOCK_SIZE;
static const size_t EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD = ConcurrentQueue::EXPLICIT_BLOCK_EMPTY_COUNTER_THRESHOLD;
static const size_t EXPLICIT_INITIAL_INDEX_SIZE = ConcurrentQueue::EXPLICIT_INITIAL_INDEX_SIZE;
static const size_t IMPLICIT_INITIAL_INDEX_SIZE = ConcurrentQueue::IMPLICIT_INITIAL_INDEX_SIZE;
static const size_t INITIAL_IMPLICIT_PRODUCER_HASH_SIZE = ConcurrentQueue::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE;
static const std::uint32_t EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE = ConcurrentQueue::EXPLICIT_CONSUMER_CONSUMPTION_QUOTA_BEFORE_ROTATE;
static const size_t MAX_SUBQUEUE_SIZE = ConcurrentQueue::MAX_SUBQUEUE_SIZE;
public:
// Creates a queue with at least `capacity` element slots; note that the
// actual number of elements that can be inserted without additional memory
// allocation depends on the number of producers and the block size (e.g. if
// the block size is equal to `capacity`, only a single block will be allocated
// up-front, which means only a single producer will be able to enqueue elements
// without an extra allocation -- blocks aren't shared between producers).
// This method is not thread safe -- it is up to the user to ensure that the
// queue is fully constructed before it starts being used by other threads (this
// includes making the memory effects of construction visible, possibly with a
// memory barrier).
explicit BlockingConcurrentQueue(size_t capacity = 6 * BLOCK_SIZE)
: inner(capacity), sema(create<LightweightSemaphore, ssize_t, int>(0, (int)Traits::MAX_SEMA_SPINS), &BlockingConcurrentQueue::template destroy<LightweightSemaphore>)
{
assert(reinterpret_cast<ConcurrentQueue*>((BlockingConcurrentQueue*)1) == &((BlockingConcurrentQueue*)1)->inner && "BlockingConcurrentQueue must have ConcurrentQueue as its first member");
if (!sema) {
MOODYCAMEL_THROW(std::bad_alloc());
}
}
BlockingConcurrentQueue(size_t minCapacity, size_t maxExplicitProducers, size_t maxImplicitProducers)
: inner(minCapacity, maxExplicitProducers, maxImplicitProducers), sema(create<LightweightSemaphore, ssize_t, int>(0, (int)Traits::MAX_SEMA_SPINS), &BlockingConcurrentQueue::template destroy<LightweightSemaphore>)
{
assert(reinterpret_cast<ConcurrentQueue*>((BlockingConcurrentQueue*)1) == &((BlockingConcurrentQueue*)1)->inner && "BlockingConcurrentQueue must have ConcurrentQueue as its first member");
if (!sema) {
MOODYCAMEL_THROW(std::bad_alloc());
}
}
// Disable copying and copy assignment
BlockingConcurrentQueue(BlockingConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION;
BlockingConcurrentQueue& operator=(BlockingConcurrentQueue const&) MOODYCAMEL_DELETE_FUNCTION;
// Moving is supported, but note that it is *not* a thread-safe operation.
// Nobody can use the queue while it's being moved, and the memory effects
// of that move must be propagated to other threads before they can use it.
// Note: When a queue is moved, its tokens are still valid but can only be
// used with the destination queue (i.e. semantically they are moved along
// with the queue itself).
BlockingConcurrentQueue(BlockingConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT
: inner(std::move(other.inner)), sema(std::move(other.sema))
{ }
inline BlockingConcurrentQueue& operator=(BlockingConcurrentQueue&& other) MOODYCAMEL_NOEXCEPT
{
return swap_internal(other);
}
// Swaps this queue's state with the other's. Not thread-safe.
// Swapping two queues does not invalidate their tokens, however
// the tokens that were created for one queue must be used with
// only the swapped queue (i.e. the tokens are tied to the
// queue's movable state, not the object itself).
inline void swap(BlockingConcurrentQueue& other) MOODYCAMEL_NOEXCEPT
{
swap_internal(other);
}
private:
BlockingConcurrentQueue& swap_internal(BlockingConcurrentQueue& other)
{
if (this == &other) {
return *this;
}
inner.swap(other.inner);
sema.swap(other.sema);
return *this;
}
public:
// Enqueues a single item (by copying it).
// Allocates memory if required. Only fails if memory allocation fails (or implicit
// production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0,
// or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
// Thread-safe.
inline bool enqueue(T const& item)
{
if ((details::likely)(inner.enqueue(item))) {
sema->signal();
return true;
}
return false;
}
// Enqueues a single item (by moving it, if possible).
// Allocates memory if required. Only fails if memory allocation fails (or implicit
// production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0,
// or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
// Thread-safe.
inline bool enqueue(T&& item)
{
if ((details::likely)(inner.enqueue(std::move(item)))) {
sema->signal();
return true;
}
return false;
}
// Enqueues a single item (by copying it) using an explicit producer token.
// Allocates memory if required. Only fails if memory allocation fails (or
// Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
// Thread-safe.
inline bool enqueue(producer_token_t const& token, T const& item)
{
if ((details::likely)(inner.enqueue(token, item))) {
sema->signal();
return true;
}
return false;
}
// Enqueues a single item (by moving it, if possible) using an explicit producer token.
// Allocates memory if required. Only fails if memory allocation fails (or
// Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
// Thread-safe.
inline bool enqueue(producer_token_t const& token, T&& item)
{
if ((details::likely)(inner.enqueue(token, std::move(item)))) {
sema->signal();
return true;
}
return false;
}
// Enqueues several items.
// Allocates memory if required. Only fails if memory allocation fails (or
// implicit production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE
// is 0, or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
// Note: Use std::make_move_iterator if the elements should be moved instead of copied.
// Thread-safe.
template<typename It>
inline bool enqueue_bulk(It itemFirst, size_t count)
{
if ((details::likely)(inner.enqueue_bulk(std::forward<It>(itemFirst), count))) {
sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count);
return true;
}
return false;
}
// Enqueues several items using an explicit producer token.
// Allocates memory if required. Only fails if memory allocation fails
// (or Traits::MAX_SUBQUEUE_SIZE has been defined and would be surpassed).
// Note: Use std::make_move_iterator if the elements should be moved
// instead of copied.
// Thread-safe.
template<typename It>
inline bool enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
{
if ((details::likely)(inner.enqueue_bulk(token, std::forward<It>(itemFirst), count))) {
sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count);
return true;
}
return false;
}
// Enqueues a single item (by copying it).
// Does not allocate memory. Fails if not enough room to enqueue (or implicit
// production is disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE
// is 0).
// Thread-safe.
inline bool try_enqueue(T const& item)
{
if (inner.try_enqueue(item)) {
sema->signal();
return true;
}
return false;
}
// Enqueues a single item (by moving it, if possible).
// Does not allocate memory (except for one-time implicit producer).
// Fails if not enough room to enqueue (or implicit production is
// disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
// Thread-safe.
inline bool try_enqueue(T&& item)
{
if (inner.try_enqueue(std::move(item))) {
sema->signal();
return true;
}
return false;
}
// Enqueues a single item (by copying it) using an explicit producer token.
// Does not allocate memory. Fails if not enough room to enqueue.
// Thread-safe.
inline bool try_enqueue(producer_token_t const& token, T const& item)
{
if (inner.try_enqueue(token, item)) {
sema->signal();
return true;
}
return false;
}
// Enqueues a single item (by moving it, if possible) using an explicit producer token.
// Does not allocate memory. Fails if not enough room to enqueue.
// Thread-safe.
inline bool try_enqueue(producer_token_t const& token, T&& item)
{
if (inner.try_enqueue(token, std::move(item))) {
sema->signal();
return true;
}
return false;
}
// Enqueues several items.
// Does not allocate memory (except for one-time implicit producer).
// Fails if not enough room to enqueue (or implicit production is
// disabled because Traits::INITIAL_IMPLICIT_PRODUCER_HASH_SIZE is 0).
// Note: Use std::make_move_iterator if the elements should be moved
// instead of copied.
// Thread-safe.
template<typename It>
inline bool try_enqueue_bulk(It itemFirst, size_t count)
{
if (inner.try_enqueue_bulk(std::forward<It>(itemFirst), count)) {
sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count);
return true;
}
return false;
}
// Enqueues several items using an explicit producer token.
// Does not allocate memory. Fails if not enough room to enqueue.
// Note: Use std::make_move_iterator if the elements should be moved
// instead of copied.
// Thread-safe.
template<typename It>
inline bool try_enqueue_bulk(producer_token_t const& token, It itemFirst, size_t count)
{
if (inner.try_enqueue_bulk(token, std::forward<It>(itemFirst), count)) {
sema->signal((LightweightSemaphore::ssize_t)(ssize_t)count);
return true;
}
return false;
}
// Attempts to dequeue from the queue.
// Returns false if all producer streams appeared empty at the time they
// were checked (so, the queue is likely but not guaranteed to be empty).
// Never allocates. Thread-safe.
template<typename U>
inline bool try_dequeue(U& item)
{
if (sema->tryWait()) {
while (!inner.try_dequeue(item)) {
continue;
}
return true;
}
return false;
}
// Attempts to dequeue from the queue using an explicit consumer token.
// Returns false if all producer streams appeared empty at the time they
// were checked (so, the queue is likely but not guaranteed to be empty).
// Never allocates. Thread-safe.
template<typename U>
inline bool try_dequeue(consumer_token_t& token, U& item)
{
if (sema->tryWait()) {
while (!inner.try_dequeue(token, item)) {
continue;
}
return true;
}
return false;
}
// Attempts to dequeue several elements from the queue.
// Returns the number of items actually dequeued.
// Returns 0 if all producer streams appeared empty at the time they
// were checked (so, the queue is likely but not guaranteed to be empty).
// Never allocates. Thread-safe.
template<typename It>
inline size_t try_dequeue_bulk(It itemFirst, size_t max)
{
size_t count = 0;
max = (size_t)sema->tryWaitMany((LightweightSemaphore::ssize_t)(ssize_t)max);
while (count != max) {
count += inner.template try_dequeue_bulk<It&>(itemFirst, max - count);
}
return count;
}
// Attempts to dequeue several elements from the queue using an explicit consumer token.
// Returns the number of items actually dequeued.
// Returns 0 if all producer streams appeared empty at the time they
// were checked (so, the queue is likely but not guaranteed to be empty).
// Never allocates. Thread-safe.
template<typename It>
inline size_t try_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max)
{
size_t count = 0;
max = (size_t)sema->tryWaitMany((LightweightSemaphore::ssize_t)(ssize_t)max);
while (count != max) {
count += inner.template try_dequeue_bulk<It&>(token, itemFirst, max - count);
}
return count;
}
// Blocks the current thread until there's something to dequeue, then
// dequeues it.
// Never allocates. Thread-safe.
template<typename U>
inline void wait_dequeue(U& item)
{
while (!sema->wait()) {
continue;
}
while (!inner.try_dequeue(item)) {
continue;
}
}
// Blocks the current thread until either there's something to dequeue
// or the timeout (specified in microseconds) expires. Returns false
// without setting `item` if the timeout expires, otherwise assigns
// to `item` and returns true.
// Using a negative timeout indicates an indefinite timeout,
// and is thus functionally equivalent to calling wait_dequeue.
// Never allocates. Thread-safe.
template<typename U>
inline bool wait_dequeue_timed(U& item, std::int64_t timeout_usecs)
{
if (!sema->wait(timeout_usecs)) {
return false;
}
while (!inner.try_dequeue(item)) {
continue;
}
return true;
}
// Blocks the current thread until either there's something to dequeue
// or the timeout expires. Returns false without setting `item` if the
// timeout expires, otherwise assigns to `item` and returns true.
// Never allocates. Thread-safe.
template<typename U, typename Rep, typename Period>
inline bool wait_dequeue_timed(U& item, std::chrono::duration<Rep, Period> const& timeout)
{
return wait_dequeue_timed(item, std::chrono::duration_cast<std::chrono::microseconds>(timeout).count());
}
// Blocks the current thread until there's something to dequeue, then
// dequeues it using an explicit consumer token.
// Never allocates. Thread-safe.
template<typename U>
inline void wait_dequeue(consumer_token_t& token, U& item)
{
while (!sema->wait()) {
continue;
}
while (!inner.try_dequeue(token, item)) {
continue;
}
}
// Blocks the current thread until either there's something to dequeue
// or the timeout (specified in microseconds) expires. Returns false
// without setting `item` if the timeout expires, otherwise assigns
// to `item` and returns true.
// Using a negative timeout indicates an indefinite timeout,
// and is thus functionally equivalent to calling wait_dequeue.
// Never allocates. Thread-safe.
template<typename U>
inline bool wait_dequeue_timed(consumer_token_t& token, U& item, std::int64_t timeout_usecs)
{
if (!sema->wait(timeout_usecs)) {
return false;
}
while (!inner.try_dequeue(token, item)) {
continue;
}
return true;
}
// Blocks the current thread until either there's something to dequeue
// or the timeout expires. Returns false without setting `item` if the
// timeout expires, otherwise assigns to `item` and returns true.
// Never allocates. Thread-safe.
template<typename U, typename Rep, typename Period>
inline bool wait_dequeue_timed(consumer_token_t& token, U& item, std::chrono::duration<Rep, Period> const& timeout)
{
return wait_dequeue_timed(token, item, std::chrono::duration_cast<std::chrono::microseconds>(timeout).count());
}
// Attempts to dequeue several elements from the queue.
// Returns the number of items actually dequeued, which will
// always be at least one (this method blocks until the queue
// is non-empty) and at most max.
// Never allocates. Thread-safe.
template<typename It>
inline size_t wait_dequeue_bulk(It itemFirst, size_t max)
{
size_t count = 0;
max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max);
while (count != max) {
count += inner.template try_dequeue_bulk<It&>(itemFirst, max - count);
}
return count;
}
// Attempts to dequeue several elements from the queue.
// Returns the number of items actually dequeued, which can
// be 0 if the timeout expires while waiting for elements,
// and at most max.
// Using a negative timeout indicates an indefinite timeout,
// and is thus functionally equivalent to calling wait_dequeue_bulk.
// Never allocates. Thread-safe.
template<typename It>
inline size_t wait_dequeue_bulk_timed(It itemFirst, size_t max, std::int64_t timeout_usecs)
{
size_t count = 0;
max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max, timeout_usecs);
while (count != max) {
count += inner.template try_dequeue_bulk<It&>(itemFirst, max - count);
}
return count;
}
// Attempts to dequeue several elements from the queue.
// Returns the number of items actually dequeued, which can
// be 0 if the timeout expires while waiting for elements,
// and at most max.
// Never allocates. Thread-safe.
template<typename It, typename Rep, typename Period>
inline size_t wait_dequeue_bulk_timed(It itemFirst, size_t max, std::chrono::duration<Rep, Period> const& timeout)
{
return wait_dequeue_bulk_timed<It&>(itemFirst, max, std::chrono::duration_cast<std::chrono::microseconds>(timeout).count());
}
// Attempts to dequeue several elements from the queue using an explicit consumer token.
// Returns the number of items actually dequeued, which will
// always be at least one (this method blocks until the queue
// is non-empty) and at most max.
// Never allocates. Thread-safe.
template<typename It>
inline size_t wait_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max)
{
size_t count = 0;
max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max);
while (count != max) {
count += inner.template try_dequeue_bulk<It&>(token, itemFirst, max - count);
}
return count;
}
// Attempts to dequeue several elements from the queue using an explicit consumer token.
// Returns the number of items actually dequeued, which can
// be 0 if the timeout expires while waiting for elements,
// and at most max.
// Using a negative timeout indicates an indefinite timeout,
// and is thus functionally equivalent to calling wait_dequeue_bulk.
// Never allocates. Thread-safe.
template<typename It>
inline size_t wait_dequeue_bulk_timed(consumer_token_t& token, It itemFirst, size_t max, std::int64_t timeout_usecs)
{
size_t count = 0;
max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max, timeout_usecs);
while (count != max) {
count += inner.template try_dequeue_bulk<It&>(token, itemFirst, max - count);
}
return count;
}
// Attempts to dequeue several elements from the queue using an explicit consumer token.
// Returns the number of items actually dequeued, which can
// be 0 if the timeout expires while waiting for elements,
// and at most max.
// Never allocates. Thread-safe.
template<typename It, typename Rep, typename Period>
inline size_t wait_dequeue_bulk_timed(consumer_token_t& token, It itemFirst, size_t max, std::chrono::duration<Rep, Period> const& timeout)
{
return wait_dequeue_bulk_timed<It&>(token, itemFirst, max, std::chrono::duration_cast<std::chrono::microseconds>(timeout).count());
}
// Returns an estimate of the total number of elements currently in the queue. This
// estimate is only accurate if the queue has completely stabilized before it is called
// (i.e. all enqueue and dequeue operations have completed and their memory effects are
// visible on the calling thread, and no further operations start while this method is
// being called).
// Thread-safe.
inline size_t size_approx() const
{
return (size_t)sema->availableApprox();
}
// Returns true if the underlying atomic variables used by
// the queue are lock-free (they should be on most platforms).
// Thread-safe.
static constexpr bool is_lock_free()
{
return ConcurrentQueue::is_lock_free();
}
private:
template<typename U, typename A1, typename A2>
static inline U* create(A1&& a1, A2&& a2)
{
void* p = (Traits::malloc)(sizeof(U));
return p != nullptr ? new (p) U(std::forward<A1>(a1), std::forward<A2>(a2)) : nullptr;
}
template<typename U>
static inline void destroy(U* p)
{
if (p != nullptr) {
p->~U();
}
(Traits::free)(p);
}
private:
ConcurrentQueue inner;
std::unique_ptr<LightweightSemaphore, void (*)(LightweightSemaphore*)> sema;
};
template<typename T, typename Traits>
inline void swap(BlockingConcurrentQueue<T, Traits>& a, BlockingConcurrentQueue<T, Traits>& b) MOODYCAMEL_NOEXCEPT
{
a.swap(b);
}
} // end namespace moodycamel
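
The thread pool's worker loop above pairs wait_dequeue() with a ConsumerToken so each worker amortizes its per-dequeue bookkeeping. A minimal sketch of that pattern, assuming a queue of std::function and a caller-owned shutdown flag:

// Consumer-token worker loop; the queue/flag wiring here is illustrative.
#include "moodycamel/blockingconcurrentqueue.h"
#include <atomic>
#include <functional>

void
worker_loop(moodycamel::BlockingConcurrentQueue<std::function<void()>> &q,
            std::atomic<bool> &done)
{
  moodycamel::ConsumerToken ctok(q);   // one token per consumer thread
  std::function<void()> func;
  while(!done.load(std::memory_order_relaxed))
  {
    q.wait_dequeue(ctok, func);        // blocks until work arrives
    func();
  }
}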

3747
libfuse/lib/moodycamel/concurrentqueue.h
File diff suppressed because it is too large

427
libfuse/lib/moodycamel/lightweightsemaphore.h

@@ -0,0 +1,427 @@
// Provides an efficient implementation of a semaphore (LightweightSemaphore).
// This is an extension of Jeff Preshing's semaphore implementation (licensed
// under the terms of its separate zlib license) that has been adapted and
// extended by Cameron Desrochers.
#pragma once
#include <cstddef> // For std::size_t
#include <atomic>
#include <type_traits> // For std::make_signed<T>
#if defined(_WIN32)
// Avoid including windows.h in a header; we only need a handful of
// items, so we'll redeclare them here (this is relatively safe since
// the API generally has to remain stable between Windows versions).
// I know this is an ugly hack but it still beats polluting the global
// namespace with thousands of generic names or adding a .cpp for nothing.
extern "C" {
struct _SECURITY_ATTRIBUTES;
__declspec(dllimport) void* __stdcall CreateSemaphoreW(_SECURITY_ATTRIBUTES* lpSemaphoreAttributes, long lInitialCount, long lMaximumCount, const wchar_t* lpName);
__declspec(dllimport) int __stdcall CloseHandle(void* hObject);
__declspec(dllimport) unsigned long __stdcall WaitForSingleObject(void* hHandle, unsigned long dwMilliseconds);
__declspec(dllimport) int __stdcall ReleaseSemaphore(void* hSemaphore, long lReleaseCount, long* lpPreviousCount);
}
#elif defined(__MACH__)
#include <mach/mach.h>
#elif defined(__MVS__)
#include <zos-semaphore.h>
#elif defined(__unix__)
#include <semaphore.h>
#if defined(__GLIBC_PREREQ) && defined(_GNU_SOURCE)
#if __GLIBC_PREREQ(2,30)
#define MOODYCAMEL_LIGHTWEIGHTSEMAPHORE_MONOTONIC
#endif
#endif
#endif
namespace moodycamel
{
namespace details
{
// Code in the mpmc_sema namespace below is an adaptation of Jeff Preshing's
// portable + lightweight semaphore implementations, originally from
// https://github.com/preshing/cpp11-on-multicore/blob/master/common/sema.h
// LICENSE:
// Copyright (c) 2015 Jeff Preshing
//
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
// claim that you wrote the original software. If you use this software
// in a product, an acknowledgement in the product documentation would be
// appreciated but is not required.
// 2. Altered source versions must be plainly marked as such, and must not be
// misrepresented as being the original software.
// 3. This notice may not be removed or altered from any source distribution.
#if defined(_WIN32)
class Semaphore
{
private:
void* m_hSema;
Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
public:
Semaphore(int initialCount = 0)
{
assert(initialCount >= 0);
const long maxLong = 0x7fffffff;
m_hSema = CreateSemaphoreW(nullptr, initialCount, maxLong, nullptr);
assert(m_hSema);
}
~Semaphore()
{
CloseHandle(m_hSema);
}
bool wait()
{
const unsigned long infinite = 0xffffffff;
return WaitForSingleObject(m_hSema, infinite) == 0;
}
bool try_wait()
{
return WaitForSingleObject(m_hSema, 0) == 0;
}
bool timed_wait(std::uint64_t usecs)
{
return WaitForSingleObject(m_hSema, (unsigned long)(usecs / 1000)) == 0;
}
void signal(int count = 1)
{
while (!ReleaseSemaphore(m_hSema, count, nullptr));
}
};
#elif defined(__MACH__)
//---------------------------------------------------------
// Semaphore (Apple iOS and OSX)
// Can't use POSIX semaphores due to http://lists.apple.com/archives/darwin-kernel/2009/Apr/msg00010.html
//---------------------------------------------------------
class Semaphore
{
private:
semaphore_t m_sema;
Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
public:
Semaphore(int initialCount = 0)
{
assert(initialCount >= 0);
kern_return_t rc = semaphore_create(mach_task_self(), &m_sema, SYNC_POLICY_FIFO, initialCount);
assert(rc == KERN_SUCCESS);
(void)rc;
}
~Semaphore()
{
semaphore_destroy(mach_task_self(), m_sema);
}
bool wait()
{
return semaphore_wait(m_sema) == KERN_SUCCESS;
}
bool try_wait()
{
return timed_wait(0);
}
bool timed_wait(std::uint64_t timeout_usecs)
{
mach_timespec_t ts;
ts.tv_sec = static_cast<unsigned int>(timeout_usecs / 1000000);
ts.tv_nsec = static_cast<int>((timeout_usecs % 1000000) * 1000);
// added in OSX 10.10: https://developer.apple.com/library/prerelease/mac/documentation/General/Reference/APIDiffsMacOSX10_10SeedDiff/modules/Darwin.html
kern_return_t rc = semaphore_timedwait(m_sema, ts);
return rc == KERN_SUCCESS;
}
void signal()
{
while (semaphore_signal(m_sema) != KERN_SUCCESS);
}
void signal(int count)
{
while (count-- > 0)
{
while (semaphore_signal(m_sema) != KERN_SUCCESS);
}
}
};
#elif defined(__unix__) || defined(__MVS__)
//---------------------------------------------------------
// Semaphore (POSIX, Linux, zOS)
//---------------------------------------------------------
class Semaphore
{
private:
sem_t m_sema;
Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
public:
Semaphore(int initialCount = 0)
{
assert(initialCount >= 0);
int rc = sem_init(&m_sema, 0, static_cast<unsigned int>(initialCount));
assert(rc == 0);
(void)rc;
}
~Semaphore()
{
sem_destroy(&m_sema);
}
bool wait()
{
// http://stackoverflow.com/questions/2013181/gdb-causes-sem-wait-to-fail-with-eintr-error
int rc;
do {
rc = sem_wait(&m_sema);
} while (rc == -1 && errno == EINTR);
return rc == 0;
}
bool try_wait()
{
int rc;
do {
rc = sem_trywait(&m_sema);
} while (rc == -1 && errno == EINTR);
return rc == 0;
}
bool timed_wait(std::uint64_t usecs)
{
struct timespec ts;
const int usecs_in_1_sec = 1000000;
const int nsecs_in_1_sec = 1000000000;
#ifdef MOODYCAMEL_LIGHTWEIGHTSEMAPHORE_MONOTONIC
clock_gettime(CLOCK_MONOTONIC, &ts);
#else
clock_gettime(CLOCK_REALTIME, &ts);
#endif
ts.tv_sec += (time_t)(usecs / usecs_in_1_sec);
ts.tv_nsec += (long)(usecs % usecs_in_1_sec) * 1000;
// sem_timedwait bombs if you have more than 1e9 in tv_nsec
// so we have to clean things up before passing it in
if (ts.tv_nsec >= nsecs_in_1_sec) {
ts.tv_nsec -= nsecs_in_1_sec;
++ts.tv_sec;
}
int rc;
do {
#ifdef MOODYCAMEL_LIGHTWEIGHTSEMAPHORE_MONOTONIC
rc = sem_clockwait(&m_sema, CLOCK_MONOTONIC, &ts);
#else
rc = sem_timedwait(&m_sema, &ts);
#endif
} while (rc == -1 && errno == EINTR);
return rc == 0;
}
void signal()
{
while (sem_post(&m_sema) == -1);
}
void signal(int count)
{
while (count-- > 0)
{
while (sem_post(&m_sema) == -1);
}
}
};
#else
#error Unsupported platform! (No semaphore wrapper available)
#endif
} // end namespace details
//---------------------------------------------------------
// LightweightSemaphore
//---------------------------------------------------------
class LightweightSemaphore
{
public:
typedef std::make_signed<std::size_t>::type ssize_t;
private:
std::atomic<ssize_t> m_count;
details::Semaphore m_sema;
int m_maxSpins;
bool waitWithPartialSpinning(std::int64_t timeout_usecs = -1)
{
ssize_t oldCount;
int spin = m_maxSpins;
while (--spin >= 0)
{
oldCount = m_count.load(std::memory_order_relaxed);
if ((oldCount > 0) && m_count.compare_exchange_strong(oldCount, oldCount - 1, std::memory_order_acquire, std::memory_order_relaxed))
return true;
std::atomic_signal_fence(std::memory_order_acquire); // Prevent the compiler from collapsing the loop.
}
oldCount = m_count.fetch_sub(1, std::memory_order_acquire);
if (oldCount > 0)
return true;
if (timeout_usecs < 0)
{
if (m_sema.wait())
return true;
}
if (timeout_usecs > 0 && m_sema.timed_wait((std::uint64_t)timeout_usecs))
return true;
// At this point, we've timed out waiting for the semaphore, but the
// count is still decremented indicating we may still be waiting on
// it. So we have to re-adjust the count, but only if the semaphore
// wasn't signaled enough times for us too since then. If it was, we
// need to release the semaphore too.
while (true)
{
oldCount = m_count.load(std::memory_order_acquire);
if (oldCount >= 0 && m_sema.try_wait())
return true;
if (oldCount < 0 && m_count.compare_exchange_strong(oldCount, oldCount + 1, std::memory_order_relaxed, std::memory_order_relaxed))
return false;
}
}
ssize_t waitManyWithPartialSpinning(ssize_t max, std::int64_t timeout_usecs = -1)
{
assert(max > 0);
ssize_t oldCount;
int spin = m_maxSpins;
while (--spin >= 0)
{
oldCount = m_count.load(std::memory_order_relaxed);
if (oldCount > 0)
{
ssize_t newCount = oldCount > max ? oldCount - max : 0;
if (m_count.compare_exchange_strong(oldCount, newCount, std::memory_order_acquire, std::memory_order_relaxed))
return oldCount - newCount;
}
std::atomic_signal_fence(std::memory_order_acquire);
}
oldCount = m_count.fetch_sub(1, std::memory_order_acquire);
if (oldCount <= 0)
{
if ((timeout_usecs == 0) || (timeout_usecs < 0 && !m_sema.wait()) || (timeout_usecs > 0 && !m_sema.timed_wait((std::uint64_t)timeout_usecs)))
{
while (true)
{
oldCount = m_count.load(std::memory_order_acquire);
if (oldCount >= 0 && m_sema.try_wait())
break;
if (oldCount < 0 && m_count.compare_exchange_strong(oldCount, oldCount + 1, std::memory_order_relaxed, std::memory_order_relaxed))
return 0;
}
}
}
if (max > 1)
return 1 + tryWaitMany(max - 1);
return 1;
}
public:
LightweightSemaphore(ssize_t initialCount = 0, int maxSpins = 10000) : m_count(initialCount), m_maxSpins(maxSpins)
{
assert(initialCount >= 0);
assert(maxSpins >= 0);
}
bool tryWait()
{
ssize_t oldCount = m_count.load(std::memory_order_relaxed);
while (oldCount > 0)
{
if (m_count.compare_exchange_weak(oldCount, oldCount - 1, std::memory_order_acquire, std::memory_order_relaxed))
return true;
}
return false;
}
bool wait()
{
return tryWait() || waitWithPartialSpinning();
}
bool wait(std::int64_t timeout_usecs)
{
return tryWait() || waitWithPartialSpinning(timeout_usecs);
}
// Acquires between 0 and (greedily) max, inclusive
ssize_t tryWaitMany(ssize_t max)
{
assert(max >= 0);
ssize_t oldCount = m_count.load(std::memory_order_relaxed);
while (oldCount > 0)
{
ssize_t newCount = oldCount > max ? oldCount - max : 0;
if (m_count.compare_exchange_weak(oldCount, newCount, std::memory_order_acquire, std::memory_order_relaxed))
return oldCount - newCount;
}
return 0;
}
// Acquires at least one, and (greedily) at most max
ssize_t waitMany(ssize_t max, std::int64_t timeout_usecs)
{
assert(max >= 0);
ssize_t result = tryWaitMany(max);
if (result == 0 && max > 0)
result = waitManyWithPartialSpinning(max, timeout_usecs);
return result;
}
ssize_t waitMany(ssize_t max)
{
ssize_t result = waitMany(max, -1);
assert(result > 0);
return result;
}
void signal(ssize_t count = 1)
{
assert(count >= 0);
ssize_t oldCount = m_count.fetch_add(count, std::memory_order_release);
ssize_t toRelease = -oldCount < count ? -oldCount : count;
if (toRelease > 0)
{
m_sema.signal((int)toRelease);
}
}
std::size_t availableApprox() const
{
ssize_t count = m_count.load(std::memory_order_relaxed);
return count > 0 ? static_cast<std::size_t>(count) : 0;
}
};
} // end namespace moodycamel
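
On its own, LightweightSemaphore follows the spin-then-block shape described above: wait() first spins on the atomic count (up to maxSpins, which BoundedThreadPoolTraits caps at 1 via MAX_SEMA_SPINS) before falling back to the OS semaphore. A brief illustrative sketch, not from the diff:

// Illustrative use of moodycamel::LightweightSemaphore.
moodycamel::LightweightSemaphore sema(0 /* initial count */, 1 /* max spins */);

sema.signal();                 // producer: publish one unit of work

bool ok = sema.wait(1000);     // consumer: spin briefly, then block for up to
                               // 1000 microseconds; true if a unit was acquired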

11
libfuse/lib/syslog.c

@@ -58,6 +58,17 @@ syslog_log(const int priority_,
va_end(valist);
}
void
syslog_debug(const char *format_,
...)
{
va_list valist;
va_start(valist,format_);
syslog_vlog(LOG_DEBUG,format_,valist);
va_end(valist);
}
void
syslog_info(const char *format_,
...)

1
libfuse/lib/syslog.h

@@ -23,6 +23,7 @@
void syslog_open();
void syslog_log(const int priority, const char *format, ...);
void syslog_debug(const char *format, ...);
void syslog_info(const char *format, ...);
void syslog_notice(const char *format, ...);
void syslog_warning(const char *format, ...);

4
src/fuse_readdir_cor.cpp

@@ -117,7 +117,7 @@ namespace l
static
std::vector<int>
concurrent_readdir(ThreadPool &tp_,
concurrent_readdir(UnboundedThreadPool &tp_,
const Branches::CPtr &branches_,
const char *dirname_,
fuse_dirents_t *buf_,
@@ -171,7 +171,7 @@ namespace l
static
int
readdir(ThreadPool &tp_,
readdir(UnboundedThreadPool &tp_,
const Branches::CPtr &branches_,
const char *dirname_,
fuse_dirents_t *buf_,

2
src/fuse_readdir_cor.hpp

@@ -34,6 +34,6 @@ namespace FUSE
fuse_dirents_t *buf);
private:
ThreadPool _tp;
UnboundedThreadPool _tp;
};
}

4
src/fuse_readdir_cosr.cpp

@@ -63,7 +63,7 @@ namespace l
static
inline
std::vector<std::future<DIR*>>
opendir(ThreadPool &tp_,
opendir(UnboundedThreadPool &tp_,
const Branches::CPtr &branches_,
char const *dirname_,
uid_t const uid_,
@@ -148,7 +148,7 @@ namespace l
static
inline
int
readdir(ThreadPool &tp_,
readdir(UnboundedThreadPool &tp_,
const Branches::CPtr &branches_,
const char *dirname_,
fuse_dirents_t *buf_,

2
src/fuse_readdir_cosr.hpp

@@ -34,6 +34,6 @@ namespace FUSE
fuse_dirents_t *buf);
private:
ThreadPool _tp;
UnboundedThreadPool _tp;
};
}

582
src/moodycamel/blockingconcurrentqueue.h

File diff omitted: identical to libfuse/lib/moodycamel/blockingconcurrentqueue.h above.
// and at most max.
// Never allocates. Thread-safe.
template<typename It, typename Rep, typename Period>
inline size_t wait_dequeue_bulk_timed(It itemFirst, size_t max, std::chrono::duration<Rep, Period> const& timeout)
{
return wait_dequeue_bulk_timed<It&>(itemFirst, max, std::chrono::duration_cast<std::chrono::microseconds>(timeout).count());
}
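// A usage sketch (not from the upstream header) of the bulk timed
// dequeue above, assuming a queue `q` whose items are callables:
//   std::function<void()> items[32];
//   size_t n = q.wait_dequeue_bulk_timed(items, 32, std::chrono::milliseconds(10));
//   for (size_t i = 0; i != n; ++i)
//     items[i]();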
// Attempts to dequeue several elements from the queue using an explicit consumer token.
// Returns the number of items actually dequeued, which will
// always be at least one (this method blocks until the queue
// is non-empty) and at most max.
// Never allocates. Thread-safe.
template<typename It>
inline size_t wait_dequeue_bulk(consumer_token_t& token, It itemFirst, size_t max)
{
size_t count = 0;
max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max);
while (count != max) {
count += inner.template try_dequeue_bulk<It&>(token, itemFirst, max - count);
}
return count;
}
// Attempts to dequeue several elements from the queue using an explicit consumer token.
// Returns the number of items actually dequeued, which can
// be 0 if the timeout expires while waiting for elements,
// and at most max.
// Using a negative timeout indicates an indefinite timeout,
// and is thus functionally equivalent to calling wait_dequeue_bulk.
// Never allocates. Thread-safe.
template<typename It>
inline size_t wait_dequeue_bulk_timed(consumer_token_t& token, It itemFirst, size_t max, std::int64_t timeout_usecs)
{
size_t count = 0;
max = (size_t)sema->waitMany((LightweightSemaphore::ssize_t)(ssize_t)max, timeout_usecs);
while (count != max) {
count += inner.template try_dequeue_bulk<It&>(token, itemFirst, max - count);
}
return count;
}
// Attempts to dequeue several elements from the queue using an explicit consumer token.
// Returns the number of items actually dequeued, which can
// be 0 if the timeout expires while waiting for elements,
// and at most max.
// Never allocates. Thread-safe.
template<typename It, typename Rep, typename Period>
inline size_t wait_dequeue_bulk_timed(consumer_token_t& token, It itemFirst, size_t max, std::chrono::duration<Rep, Period> const& timeout)
{
return wait_dequeue_bulk_timed<It&>(token, itemFirst, max, std::chrono::duration_cast<std::chrono::microseconds>(timeout).count());
}
// Returns an estimate of the total number of elements currently in the queue. This
// estimate is only accurate if the queue has completely stabilized before it is called
// (i.e. all enqueue and dequeue operations have completed and their memory effects are
// visible on the calling thread, and no further operations start while this method is
// being called).
// Thread-safe.
inline size_t size_approx() const
{
return (size_t)sema->availableApprox();
}
// Returns true if the underlying atomic variables used by
// the queue are lock-free (they should be on most platforms).
// Thread-safe.
static constexpr bool is_lock_free()
{
return ConcurrentQueue::is_lock_free();
}
private:
template<typename U, typename A1, typename A2>
static inline U* create(A1&& a1, A2&& a2)
{
void* p = (Traits::malloc)(sizeof(U));
return p != nullptr ? new (p) U(std::forward<A1>(a1), std::forward<A2>(a2)) : nullptr;
}
template<typename U>
static inline void destroy(U* p)
{
if (p != nullptr) {
p->~U();
}
(Traits::free)(p);
}
private:
ConcurrentQueue inner;
std::unique_ptr<LightweightSemaphore, void (*)(LightweightSemaphore*)> sema;
};
template<typename T, typename Traits>
inline void swap(BlockingConcurrentQueue<T, Traits>& a, BlockingConcurrentQueue<T, Traits>& b) MOODYCAMEL_NOEXCEPT
{
a.swap(b);
}
} // end namespace moodycamel
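A minimal usage sketch of the blocking queue vendored above (an illustration, not part of the header): one producer feeds one consumer, and the consumer uses the timed dequeue so it never blocks forever. The item count and timeout are arbitrary choices.
#include "moodycamel/blockingconcurrentqueue.h"

#include <chrono>
#include <cstdio>
#include <thread>

int
main()
{
  moodycamel::BlockingConcurrentQueue<int> q;

  std::thread producer([&]() {
    for(int i = 0; i < 100; i++)
      q.enqueue(i); // allocates as needed; the try_ variants never do
  });

  std::thread consumer([&]() {
    int item;
    int received = 0;
    while(received < 100)
      {
        // Blocks for up to 250ms; false means the timeout expired.
        if(q.wait_dequeue_timed(item,std::chrono::milliseconds(250)))
          received++;
      }
  });

  producer.join();
  consumer.join();

  return 0;
}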

3747
src/moodycamel/concurrentqueue.h
File diff suppressed because it is too large

427
src/moodycamel/lightweightsemaphore.h

@@ -0,0 +1,427 @@
// Provides an efficient implementation of a semaphore (LightweightSemaphore).
// This is an extension of Jeff Preshing's semaphore implementation (licensed
// under the terms of its separate zlib license) that has been adapted and
// extended by Cameron Desrochers.
#pragma once
#include <cstddef> // For std::size_t
#include <atomic>
#include <type_traits> // For std::make_signed<T>
#if defined(_WIN32)
// Avoid including windows.h in a header; we only need a handful of
// items, so we'll redeclare them here (this is relatively safe since
// the API generally has to remain stable between Windows versions).
// I know this is an ugly hack but it still beats polluting the global
// namespace with thousands of generic names or adding a .cpp for nothing.
extern "C" {
struct _SECURITY_ATTRIBUTES;
__declspec(dllimport) void* __stdcall CreateSemaphoreW(_SECURITY_ATTRIBUTES* lpSemaphoreAttributes, long lInitialCount, long lMaximumCount, const wchar_t* lpName);
__declspec(dllimport) int __stdcall CloseHandle(void* hObject);
__declspec(dllimport) unsigned long __stdcall WaitForSingleObject(void* hHandle, unsigned long dwMilliseconds);
__declspec(dllimport) int __stdcall ReleaseSemaphore(void* hSemaphore, long lReleaseCount, long* lpPreviousCount);
}
#elif defined(__MACH__)
#include <mach/mach.h>
#elif defined(__MVS__)
#include <zos-semaphore.h>
#elif defined(__unix__)
#include <semaphore.h>
#if defined(__GLIBC_PREREQ) && defined(_GNU_SOURCE)
#if __GLIBC_PREREQ(2,30)
#define MOODYCAMEL_LIGHTWEIGHTSEMAPHORE_MONOTONIC
#endif
#endif
#endif
namespace moodycamel
{
namespace details
{
// Code in the details namespace below is an adaptation of Jeff Preshing's
// portable + lightweight semaphore implementations, originally from
// https://github.com/preshing/cpp11-on-multicore/blob/master/common/sema.h
// LICENSE:
// Copyright (c) 2015 Jeff Preshing
//
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
// claim that you wrote the original software. If you use this software
// in a product, an acknowledgement in the product documentation would be
// appreciated but is not required.
// 2. Altered source versions must be plainly marked as such, and must not be
// misrepresented as being the original software.
// 3. This notice may not be removed or altered from any source distribution.
#if defined(_WIN32)
class Semaphore
{
private:
void* m_hSema;
Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
public:
Semaphore(int initialCount = 0)
{
assert(initialCount >= 0);
const long maxLong = 0x7fffffff;
m_hSema = CreateSemaphoreW(nullptr, initialCount, maxLong, nullptr);
assert(m_hSema);
}
~Semaphore()
{
CloseHandle(m_hSema);
}
bool wait()
{
const unsigned long infinite = 0xffffffff;
return WaitForSingleObject(m_hSema, infinite) == 0;
}
bool try_wait()
{
return WaitForSingleObject(m_hSema, 0) == 0;
}
bool timed_wait(std::uint64_t usecs)
{
return WaitForSingleObject(m_hSema, (unsigned long)(usecs / 1000)) == 0;
}
void signal(int count = 1)
{
while (!ReleaseSemaphore(m_hSema, count, nullptr));
}
};
#elif defined(__MACH__)
//---------------------------------------------------------
// Semaphore (Apple iOS and OSX)
// Can't use POSIX semaphores due to http://lists.apple.com/archives/darwin-kernel/2009/Apr/msg00010.html
//---------------------------------------------------------
class Semaphore
{
private:
semaphore_t m_sema;
Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
public:
Semaphore(int initialCount = 0)
{
assert(initialCount >= 0);
kern_return_t rc = semaphore_create(mach_task_self(), &m_sema, SYNC_POLICY_FIFO, initialCount);
assert(rc == KERN_SUCCESS);
(void)rc;
}
~Semaphore()
{
semaphore_destroy(mach_task_self(), m_sema);
}
bool wait()
{
return semaphore_wait(m_sema) == KERN_SUCCESS;
}
bool try_wait()
{
return timed_wait(0);
}
bool timed_wait(std::uint64_t timeout_usecs)
{
mach_timespec_t ts;
ts.tv_sec = static_cast<unsigned int>(timeout_usecs / 1000000);
ts.tv_nsec = static_cast<int>((timeout_usecs % 1000000) * 1000);
// added in OSX 10.10: https://developer.apple.com/library/prerelease/mac/documentation/General/Reference/APIDiffsMacOSX10_10SeedDiff/modules/Darwin.html
kern_return_t rc = semaphore_timedwait(m_sema, ts);
return rc == KERN_SUCCESS;
}
void signal()
{
while (semaphore_signal(m_sema) != KERN_SUCCESS);
}
void signal(int count)
{
while (count-- > 0)
{
while (semaphore_signal(m_sema) != KERN_SUCCESS);
}
}
};
#elif defined(__unix__) || defined(__MVS__)
//---------------------------------------------------------
// Semaphore (POSIX, Linux, zOS)
//---------------------------------------------------------
class Semaphore
{
private:
sem_t m_sema;
Semaphore(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
Semaphore& operator=(const Semaphore& other) MOODYCAMEL_DELETE_FUNCTION;
public:
Semaphore(int initialCount = 0)
{
assert(initialCount >= 0);
int rc = sem_init(&m_sema, 0, static_cast<unsigned int>(initialCount));
assert(rc == 0);
(void)rc;
}
~Semaphore()
{
sem_destroy(&m_sema);
}
bool wait()
{
// http://stackoverflow.com/questions/2013181/gdb-causes-sem-wait-to-fail-with-eintr-error
int rc;
do {
rc = sem_wait(&m_sema);
} while (rc == -1 && errno == EINTR);
return rc == 0;
}
bool try_wait()
{
int rc;
do {
rc = sem_trywait(&m_sema);
} while (rc == -1 && errno == EINTR);
return rc == 0;
}
bool timed_wait(std::uint64_t usecs)
{
struct timespec ts;
const int usecs_in_1_sec = 1000000;
const int nsecs_in_1_sec = 1000000000;
#ifdef MOODYCAMEL_LIGHTWEIGHTSEMAPHORE_MONOTONIC
clock_gettime(CLOCK_MONOTONIC, &ts);
#else
clock_gettime(CLOCK_REALTIME, &ts);
#endif
ts.tv_sec += (time_t)(usecs / usecs_in_1_sec);
ts.tv_nsec += (long)(usecs % usecs_in_1_sec) * 1000;
// sem_timedwait bombs if you have more than 1e9 in tv_nsec
// so we have to clean things up before passing it in
if (ts.tv_nsec >= nsecs_in_1_sec) {
ts.tv_nsec -= nsecs_in_1_sec;
++ts.tv_sec;
}
int rc;
do {
#ifdef MOODYCAMEL_LIGHTWEIGHTSEMAPHORE_MONOTONIC
rc = sem_clockwait(&m_sema, CLOCK_MONOTONIC, &ts);
#else
rc = sem_timedwait(&m_sema, &ts);
#endif
} while (rc == -1 && errno == EINTR);
return rc == 0;
}
void signal()
{
while (sem_post(&m_sema) == -1);
}
void signal(int count)
{
while (count-- > 0)
{
while (sem_post(&m_sema) == -1);
}
}
};
#else
#error Unsupported platform! (No semaphore wrapper available)
#endif
} // end namespace details
//---------------------------------------------------------
// LightweightSemaphore
//---------------------------------------------------------
class LightweightSemaphore
{
public:
typedef std::make_signed<std::size_t>::type ssize_t;
private:
std::atomic<ssize_t> m_count;
details::Semaphore m_sema;
int m_maxSpins;
bool waitWithPartialSpinning(std::int64_t timeout_usecs = -1)
{
ssize_t oldCount;
int spin = m_maxSpins;
while (--spin >= 0)
{
oldCount = m_count.load(std::memory_order_relaxed);
if ((oldCount > 0) && m_count.compare_exchange_strong(oldCount, oldCount - 1, std::memory_order_acquire, std::memory_order_relaxed))
return true;
std::atomic_signal_fence(std::memory_order_acquire); // Prevent the compiler from collapsing the loop.
}
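// Spinning failed; pessimistically register as a waiter by taking the
// count negative. A negative count tracks how many threads are (or are
// about to be) blocked on the underlying kernel semaphore.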
oldCount = m_count.fetch_sub(1, std::memory_order_acquire);
if (oldCount > 0)
return true;
if (timeout_usecs < 0)
{
if (m_sema.wait())
return true;
}
if (timeout_usecs > 0 && m_sema.timed_wait((std::uint64_t)timeout_usecs))
return true;
// At this point, we've timed out waiting for the semaphore, but the
// count is still decremented, indicating we may still be registered
// as a waiter. So we have to re-adjust the count, but only if the
// semaphore wasn't signaled enough times to cover us in the meantime.
// If it was, we consume that signal and the wait succeeds after all.
while (true)
{
oldCount = m_count.load(std::memory_order_acquire);
if (oldCount >= 0 && m_sema.try_wait())
return true;
if (oldCount < 0 && m_count.compare_exchange_strong(oldCount, oldCount + 1, std::memory_order_relaxed, std::memory_order_relaxed))
return false;
}
}
ssize_t waitManyWithPartialSpinning(ssize_t max, std::int64_t timeout_usecs = -1)
{
assert(max > 0);
ssize_t oldCount;
int spin = m_maxSpins;
while (--spin >= 0)
{
oldCount = m_count.load(std::memory_order_relaxed);
if (oldCount > 0)
{
ssize_t newCount = oldCount > max ? oldCount - max : 0;
if (m_count.compare_exchange_strong(oldCount, newCount, std::memory_order_acquire, std::memory_order_relaxed))
return oldCount - newCount;
}
std::atomic_signal_fence(std::memory_order_acquire);
}
oldCount = m_count.fetch_sub(1, std::memory_order_acquire);
if (oldCount <= 0)
{
if ((timeout_usecs == 0) || (timeout_usecs < 0 && !m_sema.wait()) || (timeout_usecs > 0 && !m_sema.timed_wait((std::uint64_t)timeout_usecs)))
{
while (true)
{
oldCount = m_count.load(std::memory_order_acquire);
if (oldCount >= 0 && m_sema.try_wait())
break;
if (oldCount < 0 && m_count.compare_exchange_strong(oldCount, oldCount + 1, std::memory_order_relaxed, std::memory_order_relaxed))
return 0;
}
}
}
if (max > 1)
return 1 + tryWaitMany(max - 1);
return 1;
}
public:
LightweightSemaphore(ssize_t initialCount = 0, int maxSpins = 10000) : m_count(initialCount), m_maxSpins(maxSpins)
{
assert(initialCount >= 0);
assert(maxSpins >= 0);
}
bool tryWait()
{
ssize_t oldCount = m_count.load(std::memory_order_relaxed);
while (oldCount > 0)
{
if (m_count.compare_exchange_weak(oldCount, oldCount - 1, std::memory_order_acquire, std::memory_order_relaxed))
return true;
}
return false;
}
bool wait()
{
return tryWait() || waitWithPartialSpinning();
}
bool wait(std::int64_t timeout_usecs)
{
return tryWait() || waitWithPartialSpinning(timeout_usecs);
}
// Acquires between 0 and (greedily) max, inclusive
ssize_t tryWaitMany(ssize_t max)
{
assert(max >= 0);
ssize_t oldCount = m_count.load(std::memory_order_relaxed);
while (oldCount > 0)
{
ssize_t newCount = oldCount > max ? oldCount - max : 0;
if (m_count.compare_exchange_weak(oldCount, newCount, std::memory_order_acquire, std::memory_order_relaxed))
return oldCount - newCount;
}
return 0;
}
// Acquires at least one, and (greedily) at most max
ssize_t waitMany(ssize_t max, std::int64_t timeout_usecs)
{
assert(max >= 0);
ssize_t result = tryWaitMany(max);
if (result == 0 && max > 0)
result = waitManyWithPartialSpinning(max, timeout_usecs);
return result;
}
ssize_t waitMany(ssize_t max)
{
ssize_t result = waitMany(max, -1);
assert(result > 0);
return result;
}
void signal(ssize_t count = 1)
{
assert(count >= 0);
ssize_t oldCount = m_count.fetch_add(count, std::memory_order_release);
ssize_t toRelease = -oldCount < count ? -oldCount : count;
if (toRelease > 0)
{
m_sema.signal((int)toRelease);
}
}
std::size_t availableApprox() const
{
ssize_t count = m_count.load(std::memory_order_relaxed);
return count > 0 ? static_cast<std::size_t>(count) : 0;
}
};
} // end namespace moodycamel
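A small sketch of driving LightweightSemaphore directly (an illustration; in this codebase it is only consumed indirectly through the blocking queue). Including blockingconcurrentqueue.h keeps the example self-contained:
#include "moodycamel/blockingconcurrentqueue.h" // also brings in the semaphore

#include <cstdio>
#include <thread>

int
main()
{
  moodycamel::LightweightSemaphore sema; // count starts at 0

  std::thread worker([&]() {
    sema.wait(); // spins up to maxSpins, then sleeps in the kernel
    std::printf("woke up\n");
  });

  sema.signal(); // raise the count; wakes the worker
  worker.join();

  sema.signal(3);
  long got = (long)sema.tryWaitMany(8); // greedy, non-blocking; yields 3 here
  std::printf("acquired %ld of 8\n",got);

  return 0;
}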

11
src/syslog.cpp

@@ -56,6 +56,17 @@ syslog_log(const int priority_,
va_end(valist);
}
void
syslog_debug(const char *format_,
...)
{
va_list valist;
va_start(valist,format_);
syslog_log(LOG_DEBUG,format_,valist);
va_end(valist);
}
void
syslog_info(const char *format_,
...)

1
src/syslog.hpp

@@ -23,6 +23,7 @@
void syslog_open();
void syslog_log(const int priority, const char *format, ...);
void syslog_debug(const char *format, ...);
void syslog_info(const char *format, ...);
void syslog_notice(const char *format, ...);
void syslog_warning(const char *format, ...);
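For reference, a hypothetical call site for the new debug-level helper (log_spawn is invented for illustration; syslog_open() is assumed to have run at startup):
#include "syslog.hpp"

#include <cstddef>

static
void
log_spawn(std::size_t const count_)
{
  syslog_debug("threadpool: spawning %zu threads",count_);
}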

161
src/unbounded_queue.hpp

@@ -1,161 +0,0 @@
#pragma once
#include <condition_variable>
#include <mutex>
#include <queue>
#include <utility>
template<typename T>
class UnboundedQueue
{
public:
explicit
UnboundedQueue(bool block_ = true)
: _block(block_)
{
}
void
push(const T& item_)
{
{
std::lock_guard<std::mutex> guard(_queue_lock);
_queue.push(item_);
}
_condition.notify_one();
}
void
push(T&& item_)
{
{
std::lock_guard<std::mutex> guard(_queue_lock);
_queue.push(std::move(item_));
}
_condition.notify_one();
}
template<typename... Args>
void
emplace(Args&&... args_)
{
{
std::lock_guard<std::mutex> guard(_queue_lock);
_queue.emplace(std::forward<Args>(args_)...);
}
_condition.notify_one();
}
bool
try_push(const T& item_)
{
{
std::unique_lock<std::mutex> lock(_queue_lock, std::try_to_lock);
if(!lock)
return false;
_queue.push(item_);
}
_condition.notify_one();
return true;
}
bool
try_push(T&& item_)
{
{
std::unique_lock<std::mutex> lock(_queue_lock, std::try_to_lock);
if(!lock)
return false;
_queue.push(std::move(item_));
}
_condition.notify_one();
return true;
}
//TODO: push multiple T at once
bool
pop(T& item_)
{
std::unique_lock<std::mutex> guard(_queue_lock);
_condition.wait(guard, [&]() { return !_queue.empty() || !_block; });
if(_queue.empty())
return false;
item_ = std::move(_queue.front());
_queue.pop();
return true;
}
bool
try_pop(T& item_)
{
std::unique_lock<std::mutex> lock(_queue_lock, std::try_to_lock);
if(!lock || _queue.empty())
return false;
item_ = std::move(_queue.front());
_queue.pop();
return true;
}
std::size_t
size() const
{
std::lock_guard<std::mutex> guard(_queue_lock);
return _queue.size();
}
bool
empty() const
{
std::lock_guard<std::mutex> guard(_queue_lock);
return _queue.empty();
}
void
block()
{
std::lock_guard<std::mutex> guard(_queue_lock);
_block = true;
}
void
unblock()
{
{
std::lock_guard<std::mutex> guard(_queue_lock);
_block = false;
}
_condition.notify_all();
}
bool
blocking() const
{
std::lock_guard<std::mutex> guard(_queue_lock);
return _block;
}
private:
mutable std::mutex _queue_lock;
private:
bool _block;
std::queue<T> _queue;
std::condition_variable _condition;
};
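For contrast with the new design, a minimal sketch of the shutdown protocol the removed UnboundedQueue relied on: workers loop on pop(), and unblock() both wakes them and lets pop() return false once the queue drains.
#include <cstdio>
#include <thread>

// assumes the UnboundedQueue definition above is in scope
int
main()
{
  UnboundedQueue<int> q;

  std::thread worker([&]() {
    int item;
    while(q.pop(item)) // false only after unblock() and an empty queue
      std::printf("got %d\n",item);
  });

  q.push(1);
  q.push(2);
  q.unblock(); // the shutdown signal
  worker.join();

  return 0;
}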

188
src/unbounded_thread_pool.hpp

@@ -1,53 +1,42 @@
#pragma once
#include "unbounded_queue.hpp"
#include "moodycamel/blockingconcurrentqueue.h"
#include "syslog.hpp"
#include <atomic>
#include <csignal>
#include <functional>
#include <cstring>
#include <future>
#include <memory>
#include <stdexcept>
#include <string>
#include <thread>
#include <tuple>
#include <type_traits>
#include <utility>
#include <vector>
class ThreadPool
struct UnboundedThreadPoolTraits : public moodycamel::ConcurrentQueueDefaultTraits
{
static const int MAX_SEMA_SPINS = 1;
};
class UnboundedThreadPool
{
private:
using Func = std::function<void(void)>;
using Queue = moodycamel::BlockingConcurrentQueue<Func,UnboundedThreadPoolTraits>;
public:
explicit
ThreadPool(std::size_t const thread_count_ = std::thread::hardware_concurrency(),
std::string const name_ = {})
: _queues(thread_count_),
_count(thread_count_),
UnboundedThreadPool(std::size_t const thread_count_ = std::thread::hardware_concurrency(),
std::string const name_ = {})
: _queue(thread_count_,thread_count_,thread_count_),
_done(false),
_name(name_)
{
syslog_info("threadpool: spawning %zu threads named '%s'",
_count,
_name.c_str());
auto worker = [this](std::size_t i)
{
while(true)
{
Proc f;
for(std::size_t n = 0; n < (_count * K); ++n)
{
if(_queues[(i + n) % _count].try_pop(f))
break;
}
if(!f && !_queues[i].pop(f))
break;
f();
}
};
syslog_debug("threadpool: spawning %zu threads named '%s'",
thread_count_,
_name.c_str());
sigset_t oldset;
sigset_t newset;
@@ -57,52 +46,95 @@ public:
_threads.reserve(thread_count_);
for(std::size_t i = 0; i < thread_count_; ++i)
_threads.emplace_back(worker, i);
if(!_name.empty())
{
for(auto &t : _threads)
pthread_setname_np(t.native_handle(),_name.c_str());
int rv;
pthread_t t;
rv = pthread_create(&t,NULL,UnboundedThreadPool::start_routine,this);
if(rv != 0)
{
syslog_warning("threadpool: error spawning thread - %d (%s)",
rv,
strerror(rv));
continue;
}
if(!_name.empty())
pthread_setname_np(t,_name.c_str());
_threads.push_back(t);
}
pthread_sigmask(SIG_SETMASK,&oldset,NULL);
if(_threads.empty())
throw std::runtime_error("threadpool: failed to spawn any threads");
}
~ThreadPool()
~UnboundedThreadPool()
{
syslog_info("threadpool: destroying %zu threads named '%s'",
_count,
_name.c_str());
for(auto& queue : _queues)
queue.unblock();
for(auto& thread : _threads)
thread.join();
syslog_debug("threadpool: destroying %zu threads named '%s'",
_threads.size(),
_name.c_str());
_done.store(true,std::memory_order_relaxed);
for(auto t : _threads)
pthread_cancel(t);
Func f;
while(_queue.try_dequeue(f))
continue;
for(auto t : _threads)
pthread_join(t,NULL);
}
template<typename F>
void
enqueue_work(F&& f_)
private:
static
void*
start_routine(void *arg_)
{
auto i = _index.fetch_add(1,std::memory_order_relaxed);
UnboundedThreadPool *btp = static_cast<UnboundedThreadPool*>(arg_);
UnboundedThreadPool::Func func;
std::atomic<bool> &done = btp->_done;
UnboundedThreadPool::Queue &q = btp->_queue;
moodycamel::ConsumerToken ctok(btp->_queue);
for(std::size_t n = 0; n < (_count * K); ++n)
while(!done.load(std::memory_order_relaxed))
{
if(_queues[(i + n) % _count].try_push(f_))
return;
q.wait_dequeue(ctok,func);
func();
}
_queues[i % _count].push(std::move(f_));
return NULL;
}
public:
template<typename FuncType>
void
enqueue_work(moodycamel::ProducerToken &ptok_,
FuncType &&f_)
{
_queue.enqueue(ptok_,f_);
}
template<typename FuncType>
void
enqueue_work(FuncType &&f_)
{
_queue.enqueue(f_);
}
template<typename F>
template<typename FuncType>
[[nodiscard]]
std::future<typename std::result_of<F()>::type>
enqueue_task(F&& f_)
std::future<typename std::result_of<FuncType()>::type>
enqueue_task(FuncType&& f_)
{
using TaskReturnType = typename std::result_of<F()>::type;
using TaskReturnType = typename std::result_of<FuncType()>::type;
using Promise = std::promise<TaskReturnType>;
auto i = _index.fetch_add(1,std::memory_order_relaxed);
auto promise = std::make_shared<Promise>();
auto future = promise->get_future();
auto work = [=]()
@@ -111,42 +143,30 @@ public:
promise->set_value(rv);
};
for(std::size_t n = 0; n < (_count * K); ++n)
{
if(_queues[(i + n) % _count].try_push(work))
return future;
}
_queues[i % _count].push(std::move(work));
_queue.enqueue(work);
return future;
}
public:
std::vector<pthread_t>
threads()
threads() const
{
std::vector<pthread_t> rv;
for(auto &thread : _threads)
rv.push_back(thread.native_handle());
return rv;
return _threads;
}
private:
using Proc = std::function<void(void)>;
using Queue = UnboundedQueue<Proc>;
using Queues = std::vector<Queue>;
Queues _queues;
public:
Queue&
queue()
{
return _queue;
}
private:
std::vector<std::thread> _threads;
Queue _queue;
private:
std::size_t const _count;
std::atomic_uint _index;
std::string const _name;
static const unsigned int K = 2;
std::atomic<bool> _done;
std::string const _name;
std::vector<pthread_t> _threads;
};
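A minimal sketch (an illustration, not from the diff) of using the rewritten pool:
#include "unbounded_thread_pool.hpp"

#include <cstdio>

int
main()
{
  UnboundedThreadPool pool(4,"example");

  // Fire and forget:
  pool.enqueue_work([]() { std::printf("hello from a worker\n"); });

  // With a result via std::future:
  auto future = pool.enqueue_task([]() { return 6 * 7; });
  std::printf("answer: %d\n",future.get());

  return 0;
} // the destructor cancels, drains, and joins the workers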