Taskflow: A General-purpose Task-parallel Programming System: taskflow/utility/object_pool.hpp Source File

#pragma once


#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <memory_resource>
#include <new>
#include <thread>
#include <utility>

#include "os.hpp"


namespace tf {


// ----------------------------------------------------------------------------

// TaggedHead128

// ----------------------------------------------------------------------------


/**
@struct TaggedHead128

@brief head of a tagged free stack stored as two full machine words

The first word holds a block address as an integer and the second word holds
a version counter. The two fields are kept as separate, unpacked members, so
no bits of the address are masked away.
*/
struct TaggedHead128 {

  /// integer representation of the block address
  using pointer_type = std::uintptr_t;

  /// integer representation of the version counter
  using tag_type = std::uintptr_t;

  /// block address; zero means "no block"
  pointer_type ptr = 0;

  /// version counter paired with the address
  tag_type tag = 0;

  /// builds a head whose address and counter are both zero
  TaggedHead128() = default;

  /// builds a head from an explicit address @c p and version counter @c t
  TaggedHead128(pointer_type p, tag_type t) noexcept
    : ptr(p), tag(t) {
  }

  /// @return the stored block address
  pointer_type get_ptr() const noexcept {
    return this->ptr;
  }

  /// @return the stored version counter
  tag_type get_tag() const noexcept {
    return this->tag;
  }
};


// ----------------------------------------------------------------------------

// TaggedHead64

// ----------------------------------------------------------------------------


/**
@struct TaggedHead64

@brief head of a tagged free stack packed into a single 64-bit word

@tparam PtrBits number of low-order bits reserved for the block address

The low @c PtrBits bits of the word hold the block address and the remaining
high bits hold a version counter. Address bits above @c PtrBits are discarded
by the constructor, so the type is only usable when every block address fits
in @c PtrBits bits. The counter is exposed as a 16-bit value even when more
than 16 tag bits are available.
*/
template <int PtrBits = TF_POINTER_BITS>
struct TaggedHead64 {

  static_assert(PtrBits >= 1,
    "TaggedHead64 requires at least one pointer bit");

  static_assert(64 - PtrBits >= 16,
    "TaggedHead64 requires at least 16 tag bits for ABA safety "
    "(PtrBits must be <= 48); use TaggedHead128 instead");

  // The packing below shifts a uintptr_t by up to 48 bits. On a target whose
  // uintptr_t is narrower than 64 bits those shifts are out of range, so
  // reject such targets with a clear diagnostic instead.
  static_assert(sizeof(uintptr_t) == 8,
    "TaggedHead64 packs address and tag into one 64-bit word and requires "
    "a 64-bit uintptr_t; use TaggedHead128 instead");

  /// integer representation of the block address
  using pointer_type = uintptr_t;

  /// integer representation of the version counter
  using tag_type = uint16_t;

  /// number of low-order bits holding the address
  static constexpr int PTR_BITS = PtrBits;

  /// number of high-order bits available above the address
  static constexpr int TAG_BITS = 64 - PtrBits;

  /// mask selecting the address field of the packed word
  static constexpr pointer_type PTR_MASK = (pointer_type{1} << PTR_BITS) - 1;

  /// packed word: address in the low PTR_BITS bits, counter above it
  uintptr_t bits {0};

  /// builds a head whose address and counter are both zero
  TaggedHead64() = default;

  /// builds a head from address @c p (truncated to PTR_BITS bits) and
  /// version counter @c t
  TaggedHead64(pointer_type p, tag_type t) noexcept
    : bits{ (p & PTR_MASK) | (static_cast<uintptr_t>(t) << PTR_BITS) } {}

  /// @return the address field of the packed word
  pointer_type get_ptr() const noexcept { return bits & PTR_MASK; }

  /// @return the version counter, truncated to 16 bits
  tag_type get_tag() const noexcept { return static_cast<tag_type>(bits >> PTR_BITS); }
};


// ----------------------------------------------------------------------------

// ObjectBlock

// ----------------------------------------------------------------------------


/**
@struct ObjectBlock

@brief pool bookkeeping header followed by raw storage for one object

@tparam T type of the object that lives in @c storage

The member order is part of the contract: from_object() recovers the block
from an object pointer by subtracting the offset of @c storage.
*/
template <typename T>
struct ObjectBlock {

  // Index of the shard this block belongs to. ObjectPool::animate writes it
  // and ObjectPool::recycle reads it to select the shard to push back to.
  uint16_t pool_id;

  // Link to the next block on a shard's free stack.
  //
  // This must be an atomic even though the stack head is what carries the
  // synchronisation. A thread that loses the pop CAS can still be holding a
  // stale pointer to a block that another thread has already popped and that
  // a third thread is pushing back:
  //
  //   A (pop_free)   reads head = {b, 5}
  //   B (pop_free)   reads head = {b, 5}, wins the CAS, hands b to its caller
  //   C (push_free)  recycles b and WRITES b->next_free
  //   A (pop_free)   READS b->next_free through its stale copy of the head
  //
  // A's CAS then fails because the version tag moved on, so the stack itself
  // is never corrupted. The overlapping read and write of next_free would
  // nevertheless be a data race, and therefore undefined behaviour, if the
  // field were a plain pointer. Concurrent atomic accesses never race.
  std::atomic<ObjectBlock*> next_free {nullptr};

  // Raw, suitably aligned bytes in which a T is constructed and destroyed.
  alignas(T) std::byte storage[sizeof(T)];

  /// @return pointer to the T living in @c storage
  T* object() noexcept {
    auto* raw = reinterpret_cast<T*>(storage);
    return std::launder(raw);
  }

  /// @return pointer to the const T living in @c storage
  const T* object() const noexcept {
    const auto* raw = reinterpret_cast<const T*>(storage);
    return std::launder(raw);
  }

  /// @brief recovers the enclosing block from a pointer to its payload
  /// @param obj pointer previously obtained from a block's @c storage
  /// @return the block whose @c storage begins at @c obj
  static ObjectBlock* from_object(T* obj) noexcept {
    constexpr std::size_t payload_offset = offsetof(ObjectBlock, storage);
    char* block_start = reinterpret_cast<char*>(obj) - payload_offset;
    return reinterpret_cast<ObjectBlock*>(block_start);
  }
};


// ----------------------------------------------------------------------------

// ObjectPool

// ----------------------------------------------------------------------------


template <typename T, typename H = TaggedHead128, size_t LogSize = 5>


class ObjectPool {


  static_assert(LogSize >= 1 && LogSize <= 15,

    "LogSize must be in [1, 15]");


  using Block = ObjectBlock<T>;


  static constexpr size_t NumPools = 1u << LogSize;


  struct alignas(TF_CACHELINE_SIZE) Shard {


    // Hot path: lock-free Treiber stack of recycled blocks.

    // _free_head sits on its own cache line (via the Shard alignas) so that

    // hot-path CAS does not invalidate the line holding _backing's mutex.

    std::atomic<H> _free_head {H{}};


    // Cold path: backing allocator for fresh block memory.

    // alignas pushes _backing to the next cache line, separating it from the

    // hot _free_head above and preventing hot/cold false sharing.

    alignas(TF_CACHELINE_SIZE) std::pmr::synchronized_pool_resource _backing {

      std::pmr::pool_options {

        .max_blocks_per_chunk        = 1024,

        .largest_required_pool_block = sizeof(Block)

      }

    };


    void push_free(Block* b) noexcept {

      H cur = _free_head.load(std::memory_order_relaxed);

      H next;

      do {

        // relaxed: the release CAS below synchronises-with pop_free's acquire,

        // making this store visible to any thread that subsequently observes b

        // at the head of the list.

        b->next_free.store(

          reinterpret_cast<Block*>(cur.get_ptr()), std::memory_order_relaxed);

        next = H(

          reinterpret_cast<typename H::pointer_type>(b),

          static_cast<typename H::tag_type>(cur.get_tag() + 1)

        );

      } while (!_free_head.compare_exchange_weak(

        cur, next,

        std::memory_order_release,   // publish next_free write to pop_free

        std::memory_order_relaxed

      ));

    }


    Block* pop_free() noexcept {

      H cur = _free_head.load(std::memory_order_acquire);

      while (cur.get_ptr()) {

        auto*  p  = reinterpret_cast<Block*>(cur.get_ptr());

        // relaxed on next_free: the acquire on _free_head (either the load

        // above or the acquire failure ordering below) synchronises-with the

        // release CAS in push_free, so the next_free store that preceded it

        // is already visible to this thread.

        Block* nx = p->next_free.load(std::memory_order_relaxed);

        H next(

          reinterpret_cast<typename H::pointer_type>(nx),

          static_cast<typename H::tag_type>(cur.get_tag() + 1)

        );

        if (_free_head.compare_exchange_weak(

              cur, next,

              std::memory_order_acquire,  // success: synchronise with push_free

              std::memory_order_acquire   // failure: fresh cur must also synchronise

        )) {                              //   before the next next_free read

          return p;

        }

      }

      return nullptr;

    }


    Block* allocate_from_backing() {

      return static_cast<Block*>(

        _backing.allocate(sizeof(Block), alignof(Block))

      );

    }


    void deallocate_to_backing(Block* b) {

      _backing.deallocate(b, sizeof(Block), alignof(Block));

    }

  };


  std::array<Shard, NumPools> _shards;


  // Returns the next shard index for this thread. The thread_local counter

  // is seeded once from the calling thread's ID hash, spreading different

  // threads across different starting shards with zero shared state.

  // Subsequent calls are a bare local increment — no atomic, no cache-line

  // traffic.

  //

  // If thread_local is broken (e.g. MSVC DLL with improper TLS), the counter

  // degrades to a single shared value and causes contention, but shard

  // selection remains correct.

  static size_t _next_shard() noexcept {

    thread_local size_t counter =

      std::hash<std::thread::id>{}(std::this_thread::get_id());

    return counter++ & (NumPools - 1);

  }


  public:


  ObjectPool() = default;


  ObjectPool(const ObjectPool&) = delete;


  ObjectPool& operator=(const ObjectPool&) = delete;


  ~ObjectPool() = default;


  template <typename... Args>


  [[nodiscard]] T* animate(Args&&... args) {

    auto  sid   = _next_shard();

    auto& shard = _shards[sid];


    Block* block = shard.pop_free();                      // hot path: lock-free

    if (!block) block = shard.allocate_from_backing();    // cold path: mutex, amortized


    block->pool_id = static_cast<uint16_t>(sid);

    return std::construct_at(block->object(), std::forward<Args>(args)...);

  }


  void recycle(T* obj) {

    if (!obj) return;

    auto* block = Block::from_object(obj);

    std::destroy_at(block->object());

    _shards[block->pool_id].push_free(block);             // hot path: lock-free

  }


  void release() {

    for (auto& shard : _shards) {

      // Release all backing chunks to upstream first — this covers both blocks

      // on the free stack and any that were never recycled, since the backing

      // pool owns memory at the chunk level, not per block.

      shard._backing.release();

      // Reset the free stack to null in O(1). Pointers it held are now

      // dangling (their backing chunks were just freed), so they must be

      // cleared before the allocator is used again.

      shard._free_head.store(H{}, std::memory_order_relaxed);

    }

  }


};


} // namespace tf

tf::ObjectPool::ObjectPool
ObjectPool(const ObjectPool &)=delete
disabled copy constructor

tf::ObjectPool::ObjectPool
ObjectPool()=default
constructs the allocator with 2^LogSize empty shards

tf::ObjectPool::animate
T * animate(Args &&... args)
constructs an object of type T in the pool and returns a pointer
Definition object_pool.hpp:579

tf::ObjectPool::~ObjectPool
~ObjectPool()=default
destroys the allocator and releases all backing memory to upstream

tf::ObjectPool::recycle
void recycle(T *obj)
destructs the object and returns its storage to the pool
Definition object_pool.hpp:621

tf::ObjectPool::operator=
ObjectPool & operator=(const ObjectPool &)=delete
disabled copy assignment operator

tf::ObjectPool::release
void release()
releases all backing memory, including blocks parked on the free stacks, to the upstream memory resource
Definition object_pool.hpp:670

tf
taskflow namespace
Definition small_vector.hpp:20

tf::TaggedHead128::get_ptr
pointer_type get_ptr() const noexcept
returns the stored block address
Definition object_pool.hpp:92

tf::TaggedHead128::TaggedHead128
TaggedHead128()=default
constructs a null, zero-tagged head

tf::TaggedHead128::ptr
pointer_type ptr
block address (reinterpret-cast to/from ObjectBlock*)
Definition object_pool.hpp:60

tf::TaggedHead128::tag
tag_type tag
ABA version counter; incremented on every push and pop.
Definition object_pool.hpp:65

tf::TaggedHead128::TaggedHead128
TaggedHead128(pointer_type p, tag_type t) noexcept
constructs a head with an explicit block address and version counter
Definition object_pool.hpp:82

tf::TaggedHead128::pointer_type
uintptr_t pointer_type
block address representation
Definition object_pool.hpp:50

tf::TaggedHead128::tag_type
uintptr_t tag_type
ABA version counter representation.
Definition object_pool.hpp:55

tf::TaggedHead128::get_tag
tag_type get_tag() const noexcept
returns the ABA version counter
Definition object_pool.hpp:102

tf::TaggedHead64::tag_type
uint16_t tag_type
ABA version counter representation.
Definition object_pool.hpp:165

tf::TaggedHead64::pointer_type
uintptr_t pointer_type
block address representation
Definition object_pool.hpp:160

tf::TaggedHead64::TaggedHead64
TaggedHead64()=default
constructs a null, zero-tagged head

tf::TaggedHead64::get_tag
tag_type get_tag() const noexcept
returns the 16-bit ABA version counter
Definition object_pool.hpp:224

tf::TaggedHead64::get_ptr
pointer_type get_ptr() const noexcept
returns the block address
Definition object_pool.hpp:214

tf::TaggedHead64::PTR_BITS
static constexpr int PTR_BITS
bits reserved for the block address
Definition object_pool.hpp:170

tf::TaggedHead64::TaggedHead64
TaggedHead64(pointer_type p, tag_type t) noexcept
constructs a head with an explicit block address and version counter
Definition object_pool.hpp:203

tf::TaggedHead64::bits
uintptr_t bits
packed word: high TAG_BITS bits hold the version tag, low PTR_BITS bits hold the address
Definition object_pool.hpp:186

tf::TaggedHead64::TAG_BITS
static constexpr int TAG_BITS
bits reserved for the version counter
Definition object_pool.hpp:175

tf::TaggedHead64::PTR_MASK
static constexpr pointer_type PTR_MASK
mask isolating the address field
Definition object_pool.hpp:180