Repository: bytemaster/fc_malloc
Branch: master
Commit: 7a56cf9eae24
Files: 19
Total size: 145.0 KB

Directory structure:
gitextract_3qg3aq5m/

├── .gitignore
├── CMakeLists.txt
├── README.md
├── bench.cpp
├── bit_index.cpp
├── bit_index.hpp
├── disruptor.hpp
├── fast_rand.cpp
├── fc_heap.hpp
├── fc_malloc.cpp
├── fc_malloc.h
├── fixed_pool.hpp
├── garbage_collector.hpp
├── hheap.cpp
├── ideas.txt
├── malloc2.cpp
├── malloc2.hpp
├── malloc3.cpp
└── mmap_alloc.hpp

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
# Compiled Object files
*.slo
*.lo
*.o

# Compiled Dynamic libraries
*.so
*.dylib

# Compiled Static libraries
*.lai
*.la
*.a


================================================
FILE: CMakeLists.txt
================================================
project( fc_malloc )
cmake_minimum_required( VERSION 2.8.8 )

IF( WIN32 )
	ADD_DEFINITIONS( -DBOOST_CONTEXT_NO_LIB )
	ADD_DEFINITIONS( -D_SCL_SECURE_NO_WARNINGS )
	ADD_DEFINITIONS( -D_WIN32_WINNT=0x0501 )
	ADD_DEFINITIONS( -D_CRT_SECURE_NO_WARNINGS )
ELSE(WIN32)
   SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x -Wall -Wno-unused-local-typedefs")
ENDIF(WIN32)


#add_executable( m3 malloc3.cpp )
add_executable( fheap bench.cpp )
target_link_libraries( fheap jemalloc )


================================================
FILE: README.md
================================================
fc_malloc
=========

Super Fast, Lock-Free, Wait-Free, CAS-free, thread-safe, memory allocator. 

Design
================== 

The key to developing fast multi-threaded allocators is eliminating 
lock-contention and false sharing.  Even simple atomic operations and
spin-locks can destroy the performance of an allocation system.  The real
challenge is that the heap is a multi-producer, multi-consumer resource 
where all threads need to read and write the common memory pool.

With fc_malloc I borrowed design principles from the LMAX disruptor and
assigned a dedicated thread for moving free blocks from all of the other
threads to the shared pool.  This makes all threads 'single producers' of
free blocks and therefore it is possible to have a lock-free, wait-free 
per-thread free list.   This also makes a single producer of 'free blocks'
which means that blocks can be aquired with a single-producer, multiple
consumer pattern.  

When there is a need for more memory and existing free-lists are not sufficent,
each thread maps its own range from the OS in 4 MB chunks. Allocating from
this 'cache miss' is not much slower than allocating stack space and
requires no contention.  Requests for larger than 4MB are allocated direclty
from the OS via mmap.  

Initial Benchmarks
==================

Testing memory allocation systems can be very difficulty and 'artificial tests',
are not always the most accurate predictors of real world performance, but I 
sought to develop a test that would stress the allocation system, particularlly
in multi-threaded environments.

The test I came up with creates 1 array per thread containing space for 500K 
allocations.  I then assigned each thread the job of randomly allocating 
empty slots in 1 array and randomly deallocating random slots in another array. 
The result is a 'random' set of producer-consumer threads.

Each allocation was 128 bytes.  Future versions of this benchmark will include
random sizes as well.  


| Benchmark                  | glibc       |  jemalloc   |   fc_malloc |
|----------------------------|-------------|-------------|-------------|
| Random Single Threaded     | 5.8s        |  4.5s       |  2.6s       |
| Random Multi Threaded (10) | 18.2s       |  13.6s      |  6.8s       |

Threads|fcalloc (s)|jemalloc(s)|fcram(MB)|jeram(mb)
---|---|---|---|---
1|4.8|9.7|97|84.3
2|5.9|14.8|120|104
3|6.5|16.8|145|123
4|7|18|167|142
5|8|18.9|185.5|160
6|8.7|20.3|214.3|189
7|9.9|22.9|238|212
8|11.4|25.2|257|224
9|12.5|26.1|278|244
10|12.9|27.9|308|270


As you can see from the results fc_malloc is over 2x faster than the
stock malloc even for single threaded cases.  For multi-threaded cases
it is 2.6x faster than the stock allocator.   The real test though is
the comparison to jemalloc which is generally considered one of the
highest performing alternative allocators available.  Here fc_malloc
is still 2x faster in the multi-threaded test.


================================================
FILE: bench.cpp
================================================
#include "fixed_pool.hpp"
#include <thread>
#include <string.h>
#include <stdio.h>
#include <iostream>
#include <sstream>
#define BENCH_SIZE ( (1024*16*2) )
#define ROUNDS 3000

/*  SEQUENTIAL BENCH
int main( int argc, char** argv )
{
  if( argc == 2 && argv[1][0] == 'S' )
  {
     printf( "fp_malloc\n");
     for( int i = 0; i < 50000000; ++i )
     {
        char* test = fp_malloc( 128 );
        assert( test != nullptr );
        test[0] = 1;
        free2( test );
     }
  }
  if( argc == 2 && argv[1][0] == 's' )
  {
     printf( "malloc\n");
     for( int i = 0; i < 50000000; ++i )
     {
        char* test = (char*)malloc( 128 );
        assert( test != nullptr );
        test[0] = 1;
        free( test );
     }
  }
  fprintf( stderr, "done\n");
 // sleep(5);
  return 0;
}
*/
/* RANDOM BENCH */
std::vector<int64_t*>  buffers[16];
void pc_bench_worker( int pro, int con, char* (*do_alloc)(int s), void (*do_free)(char*)  )
{
  int64_t total_alloc = 0;
  int64_t total_free = 0;
  int64_t total_block_alloc = 0;
  int64_t total_free_alloc = 0;

  for( int r = 0; r < ROUNDS; ++r )
  {
      for( size_t x = 0; x < BENCH_SIZE/4 ; ++x )
      {
         uint32_t p = rand() % buffers[pro].size();
         if( !buffers[pro][p] )
         {
           uint64_t si = 10000;//16 +rand()%(1024); //4000;//32 + rand() % (1<<16);
           total_alloc += si;
           int64_t* r = (int64_t*)do_alloc( si );
      //     block_header* bh = ((block_header*)r)-1;
     //      assert( bh->size() >= si + 8 );
      //     fprintf( stderr, "alloc: %p  %llu  of %llu  %u\n", r, si, bh->size(), bh->_size );
           assert( r != nullptr );
         //  assert( r[0] != 99 ); 
         
           memset( r, 0x00, si );
        //   r[0] = 99;
    //       total_block_alloc += r[1] = ((block_header*)r)[-1].size();
           buffers[pro][p] = r;
         }
      }
      for( size_t x = 0; x < BENCH_SIZE/4 ; ++x )
      {
         uint32_t p = rand() % buffers[con].size();
         assert( p < buffers[con].size() );
         assert( con < 16 );
         assert( con >= 0 );
         if( buffers[con][p] ) 
         { 
          // assert( buffers[con][p][0] == 99 ); 
          // buffers[con][p][0] = 0; 
         //  total_free += buffers[con][p][0];
         //  total_free_alloc += buffers[con][p][1];
           do_free((char*)buffers[con][p]);
           buffers[con][p] = nullptr;
         }
      }
      /*
      fprintf( stderr, "\n Total Alloc: %lld   Total Free: %lld   Net: %lld\n", total_alloc, total_free, (total_alloc-total_free) );
      fprintf( stderr, "\n Total Block Size: %lld   Total Free Blocks: %lld   Net: %lld\n\n", total_block_alloc, total_free_alloc, (total_block_alloc-total_free_alloc) );
      auto needed = (total_alloc-total_free);
      auto used = (total_block_alloc-total_free_alloc);
      auto wasted = used - needed;
      fprintf( stderr, "\n Total Waste: %lld    %f\n\n", wasted,  double(used)/double(needed) );
      */
  }
}


void pc_bench(int n, char* (*do_alloc)(int s), void (*do_free)(char*)  )
{
  for( int i = 0; i < 16; ++i )
  {
    buffers[i].resize( BENCH_SIZE );
    memset( buffers[i].data(), 0, 8 * BENCH_SIZE );
  }

  std::thread* a = nullptr;
  std::thread* b = nullptr;
  std::thread* c = nullptr;
  std::thread* d = nullptr;
  std::thread* e = nullptr;
  std::thread* f = nullptr;
  std::thread* g = nullptr;
  std::thread* h = nullptr;
  std::thread* i = nullptr;
  std::thread* j = nullptr;


 int s = 1;
  switch( n )
  {
     case 10:
     a = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } );
     n--;
     s++;
     case 9:
      b = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } );
     n--;
     s++;
     case 8:
      c = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } );
     n--;
     s++;
     case 7:
      d = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } );
     n--;
     s++;
     case 6:
     e = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } );
     n--;
     s++;
     case 5:
     f = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } );
     n--;
     s++;
     case 4:
      g = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } );
     n--;
     s++;
     case 3:
      h = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } );
     n--;
     s++;
     case 2:
      i = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } );
     n--;
     s++;
     case 1:
      j = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } );
  }
  if(a)
  a->join();
  if(b)
  b->join();
  if(c)
  c->join();
  if(d)
  d->join();
  if(e)
  e->join();
  if(f)
  f->join();
  if(g)
  g->join();
  if(h)
  h->join();
  if(i)
  i->join();
  if(j)
  j->join();

}
void pc_bench_st(char* (*do_alloc)(int s), void (*do_free)(char*)  )
{
  for( int i = 0; i < 16; ++i )
  {
    buffers[i].resize( BENCH_SIZE );
    memset( buffers[i].data(), 0, 8 * BENCH_SIZE );
  }
  //int i = 0;
  pc_bench_worker( 1, 1, do_alloc, do_free );
}
//#include <tbb/scalable_allocator.h>

char* do_malloc(int s)
{ 
    return (char*)::malloc(s); 
//   return (char*)scalable_malloc(s);
}
void  do_malloc_free(char* c)
{ 
//    scalable_free(c);
   ::free(c); 
}

char* do_fc_malloc(int s)
{ 
  return (char*)fp_malloc(s);
//    return (char*)fc_malloc(s); 
//   return (char*)scalable_malloc(s);
}
void  do_fc_free(char* c)
{ 
  fp_free((void*)c);
//    scalable_free(c);
//   fc_free(c); 
}


int main( int argc, char** argv )
{
  /*
  char* a = static_heap.alloc32();
  char* b = static_heap.alloc32();
  char* c = static_heap.alloc32();
  fprintf( stderr, "%p %p %p\n", a, b, c );
  static_heap.free32(b);
  char* d = static_heap.alloc32();
  fprintf( stderr, "%p %p %p\n", d, b, c );
  return 0;
  */

  if( argc > 2 && argv[1][0] == 'm' )
  {
    std::cerr<<"malloc multi\n";
    pc_bench( atoi(argv[2]), do_malloc, do_malloc_free );
    return 0;
  }
  if( argc > 2 && argv[1][0] == 'M' )
  {
    std::cerr<<"hash malloc multi\n";
//    pc_bench( atoi(argv[2]), do_fp_malloc, do_fp_free );
    pc_bench( atoi(argv[2]), do_fc_malloc, do_fc_free );
    return 0;
  }
  if( argc > 1 && argv[1][0] == 's' )
  {
    std::cerr<<"malloc single\n";
    pc_bench_st( do_malloc, do_malloc_free );
    return 0;
  }
  if( argc > 1 && argv[1][0] == 'S' )
  {
    std::cerr<<"hash malloc single\n";
    pc_bench_st( do_fc_malloc, do_fc_free );
    return 0;
  }
  std::string line;
  std::getline( std::cin, line );
    std::vector<char*> data;
  while( !std::cin.eof() )
  {
    std::stringstream ss(line);
    std::string cmd;

    ss >> cmd;
    if( cmd == "a" ) // allocate new data
    {
      int64_t bytes;
      ss >> bytes;
      data.push_back( (char*)fp_malloc( bytes ) );
    }
    if( cmd == "f" ) // free data at index
    {
      int64_t idx;
      ss >> idx;
      fp_free( data[idx] );
      data.erase( data.begin() + idx );
    }
    if( cmd == "c" ) // print cache
    {
    //  thread_allocator::get().print_cache();
    }
    if( cmd == "p" ) // print heap
    {

    }
    if( cmd == "l" ) // list data
    {
       fprintf( stderr, "ID]  ptr  _size   _prev_size\n");
       fprintf( stderr, "-----------------------------\n");
       for( size_t i = 0; i < data.size(); ++i )
       {
     //     block_header* bh = reinterpret_cast<block_header*>(data[i]-8);
          fprintf( stderr, "%d]  %p \n", int(i), data[i]);

       }
    }
    std::getline( std::cin, line );
  }
  return 0;
}
#if 0
  printf( "alloc\n" );
  char* tmp = fp_malloc( 61 );
  usleep( 1000 );
  char* tmp2 = fp_malloc( 134 );
  usleep( 1000 );
  char* tmp4 = fp_malloc( 899 );
  printf( "a %p  b %p   c %p\n", tmp, tmp2, tmp4 );

  usleep( 1000 );

  printf( "free\n" );
  free2( tmp );
  usleep( 1000 );
  free2( tmp2 );
  usleep( 1000 );
  free2( tmp4 );

  usleep( 1000*1000 );

  printf( "alloc again\n" );
  char* tmp1 = fp_malloc( 61 );
  usleep( 1000 );
  char* tmp3 = fp_malloc( 134 );
  usleep( 1000 );
  char* tmp5 = fp_malloc( 899 );
  printf( "a %p  b %p   c %p\n", tmp1, tmp3, tmp5 );
  free2( tmp1 );
  free2( tmp3 );
  free2( tmp4 );

  usleep( 1000*1000 );

  return 0;
}
#endif


================================================
FILE: bit_index.cpp
================================================
#include "bit_index.hpp"
#include <stdio.h>

int main( int argc, char** argv )
{

    bit_index<64*64*64> b;
    b.set_all();
    for( int i = 0; i < 66; ++i )
    {
      b.clear(i);
      assert( !b.get(i) );
      fprintf( stderr, "\nI: %d\n", i );
      if( i >= 62 )
          b.dump();
      if( b.first_set_bit() != i+1 )
      {
          exit(1);
      }
    }
    for( int i = 0; i < 66; ++i )
    {
      assert( !b.get(i) );
    }
    assert( b.get(67) );

    return 0;
    fprintf( stderr, "pow64(1) = %d\n", pow64<1>::value );
    fprintf( stderr, "pow64(2) = %d\n", pow64<2>::value );
    fprintf( stderr, "log64(pow64(2)) = %d\n", log64<pow64<2>::value>::value );
    fprintf( stderr, "pow64(log64(64*64)) = %d\n", pow64<log64<64*64>::value>::value );
    fprintf( stderr, "pow64(log64(64*64*64)) = %d\n", pow64<log64<64*64*64>::value>::value );

    fprintf(stderr, "=========== 64 =============\n" ); 
    bit_index<64> _index;
    fprintf( stderr, "first set bit: %d\n", _index.first_set_bit() );
    assert( _index.first_set_bit() == 64 );
    _index.set( 34 );
    fprintf( stderr, "first set bit: %d\n", _index.first_set_bit() );
    assert( _index.get(34) );
    assert( _index.first_set_bit() == 34 );
    _index.clear(34);
    assert( !_index.get(34) );
    assert( _index.first_set_bit() == 64 );
    fprintf(stderr, "=========== 64*64 =============\n" ); 

    bit_index<64*64> _b62;
    _b62.set(1010);
    fprintf( stderr, "first set bit: %d\n", _b62.first_set_bit() );
    assert( _b62.first_set_bit() == 1010 );
    assert( _b62.get(1010) );
    assert( _b62.clear(1010) );
    assert( !_b62.get(1010) );

    fprintf(stderr, "=========== 64*64*64 =============\n" ); 
    bit_index<64*64*64> _b64;
    fprintf( stderr, "init first bit b64: %d\n", _b64.first_set_bit() );
    _b64.set( 660 );
    fprintf( stderr, "first set:   %d\n", _b64.first_set_bit() );
    assert( _b64.get(660) );
    _b64.clear(660);
    fprintf( stderr, "final first bit b64: %d\n", _b64.first_set_bit() );
    assert( !_b64.get(660) );

    bit_index<64*64*64> _b6464;
    fprintf( stderr, "SET BIT 66\n" );
    _b6464.set( 66 );
    fprintf( stderr, "first set 66?? :   %d\n", _b6464.first_set_bit() );
    fprintf( stderr, "size of %d 64*64*64\n", int(sizeof(_b64) ) );

    bit_index<64*64*64*64> _bbb;
    fprintf( stderr, "size of %d  64*64*64*64  \n\n\n", int(sizeof(_bbb) ) );
    _bbb.set(444);
    assert(_bbb.get(444) );
    {
    bit_index<64*64> _bbb;
    fprintf( stderr, "size of %d  64*64*64*64  \n\n\n", int(sizeof(_bbb) ) );
    _bbb.set(444);
    assert(_bbb.get(444) );
    }
    /*
    {
    bit_index<20*64*64> _bbb;
    fprintf( stderr, "size of %d  64*64*64*64  \n\n\n", int(sizeof(_bbb) ) );
    _bbb.set(444);
    assert(_bbb.get(444) );
    }
    */

    _index.set(3);
    _index.set(9);
    _index.set(27);

    auto itr = _index.at( _index.first_set_bit() );
    while( !itr.end() )
    {
       fprintf( stderr, "next bit %lld\n", itr.bit() );
       itr.next_set_bit();
    }

  {
    _b62.set(3);
    _b62.set(9);
    _b62.set(270);
    _b62.set(570);
    _b62.set(1270);

    auto itr = _b62.at( _b62.first_set_bit() );
    while( !itr.end() )
    {
       fprintf( stderr, "_b62 next bit %lld\n", itr.bit() );
       itr.next_set_bit();
    }
  }

    auto tmp = _bbb.begin();
    return 0;
}


================================================
FILE: bit_index.hpp
================================================
#pragma once
#include <stdint.h>
#include <assert.h>
#include <stdio.h>

#define LZERO(X)  (__builtin_clzll((X)) )

template<uint64_t>
class bit_index;

template<uint64_t x>
struct log64;
template<>
struct log64<64> { enum { value = 1 }; };
template<>
struct log64<0> { enum { value = 0 }; };
template<uint64_t x>
struct log64 { enum { value = 1 + log64<x/64>::value }; };


template<uint64_t x>
struct pow64;

template<>
struct pow64<0> { enum ev{ value = 1 }; };

template<uint64_t x>
struct pow64 { enum ev{ value = pow64<x-1>::value*64ll }; };


template<>
class bit_index<1>
{
  public:
    enum size_enum { index_size = 1 };
    void set( uint64_t pos = 0)
    {
      assert( pos == 0 );
      bit = 1;
    }
    bool get( uint32_t pos = 0)const { return bit; }
    uint64_t& get_bits(uint64_t ) { return bit; }

    bool clear( uint64_t pos  = 0)
    {
      assert( pos == 0 );
      return !(bit = 0);  
    }
    void clear_all() { clear(); }
    void set_all()   { set();   }

    uint64_t first_set_bit()const { return !bit; }
    uint64_t size()const          { return 1;    }


    struct iterator
    {
       public:
          uint64_t& get_bits()      { return _self->bit; }
          bool     end()const       { return _bit == 1;   }
          int64_t  bit()const       { return _bit; }
          void     set()            { _self->set(_bit); }
          bool     clear()          { return _self->clear(_bit); }
          bool     operator*()const { return _self->get(_bit); }

          iterator&  next_set_bit()
          {
              _bit = 1;
              return *this;
          }

          iterator( bit_index* s=nullptr, uint8_t b = 64 ):_self(s),_bit(b){}
       private:
          bit_index* _self;
          uint8_t     _bit;
    };

    iterator at( uint64_t p ) { return iterator(this, p); }

  private:
    uint64_t bit;
};

template<>
class bit_index<0> : public bit_index<1>{};

template<>
class bit_index<64>
{
    public:
      enum size_enum { index_size = 64 };
        bit_index(uint64_t s = 0):_bits(s){}

        /**
         *  option A: use conditional to check for 0 and return 64
         */
        uint64_t first_set_bit()const     { 
            return _bits == 0 ? 64 : LZERO(_bits); 
        }

        void dump( int depth )
        {
           for( int i = 0; i < depth; ++i )
              fprintf( stderr, "    " );
           fprintf( stderr, "%llx\n", _bits );
        }

        /**
         *  Option 2, compare + shift + lzr + compare + mult + or... this approach.. while
         *  the result of LZERO(0) is undefined, multiplying it by 0 is defined.
         *
         *  This code may be faster or slower depending upon this cache miss rate and
         *  the instruction level parallelism.  Benchmarks are required.
         */
        //uint64_t first_set_bit()const     { return (_bits == 0)<<6 | (LZERO(_bits) * (_bits!=0)); }
        bool     get( uint64_t pos )const { return _bits & (1ll<<(63-pos));   }
        void     set( uint64_t pos )    
        { 
            assert( pos < 64 );
            _bits |= (1ll<<(63-pos));       
        }
        bool     clear( uint64_t pos )  
        { 
//            fprintf( stderr, "bit_index<64>::clear %llu\n", pos );
            _bits &= ~(1ll<<(63-pos));      
            //fprintf( stderr, "bit_index<64> clear: %p   %llx\n", this, _bits );
            //fprintf( stderr, "bit_index<64>::clear %llu return %llu == 0\n", pos, _bits );
            return _bits == 0;
        }

        uint64_t size()const  { return 64;                          }
        uint64_t count()const { return __builtin_popcountll(_bits); }

        void set_all()   { _bits = -1; }
        void clear_all() { _bits = 0;  }

        uint64_t& get_bits( uint64_t bit )
        {
            assert( bit < 64 );
            return _bits;
        }

        struct iterator
        {
           public:
              uint64_t& get_bits()       { return _self->_bits; }
              bool     end()const       { return _bit == 64;   }
              int64_t  bit()const       { return _bit; }
              void     set()            { _self->set(_bit); }
              bool     clear()          { return _self->clear(_bit); }
              bool     operator*()const { return _self->get(_bit); }

              iterator&  next_set_bit()
              {
                  ++_bit;
                  if( end() ) return *this;
                  bit_index tmp( (_self->_bits << (_bit))>>(_bit) );  
                  _bit = tmp.first_set_bit();
                  return *this;
              }

              iterator( bit_index* s=nullptr, uint8_t b = 64 ):_self(s),_bit(b){}
           private:
              bit_index* _self;
              uint8_t     _bit;

        };

        iterator begin()      { return iterator(this,0); }
        iterator at(uint8_t i){ return iterator(this,i); }
        iterator end()        { return iterator(this,64); }
    protected:
        friend class iterator;
        uint64_t _bits;
};

/**
 *   A bit_index is a bitset optimized for searching for set bits.  The
 *   operations set and clear maintain higher-level indexes to optimize
 *   finding of set bits.
 *
 *   The fundamental size is 64 bit and the first set bit can be found
 *   with a single instruction. For indexes up-to 64*64 in size, the
 *   first set bit can be found with 2 clz + 1 compare + 1 mult + 1 add.
 *
 */
template<uint64_t Size>
class bit_index
{
    public:
      static_assert( Size >= 64, "smaller sizes not yet supported" );

      enum size_enum { 
         index_size        = Size,
         sub_index_size    = (Size+63) / 64,
         sub_index_count   = Size / sub_index_size 
      };
       static_assert( bit_index::sub_index_count > 0, "array with size 0 is too small" );
       static_assert( bit_index::sub_index_count <= 64, "array with size 64 is too big" );


      void dump( int depth = 0 )
      {
           _base_index.dump( depth + 1 );
           for( int i = 0; i < 3; ++i )
             _sub_index[i].dump( depth + 2 );

/**
           for( int i = 0; i < depth; ++i )
              fprintf( stderr, "    " );
           fprintf( stderr, "%llx\n", _bits );
           */
      }

      
      uint64_t size()const  { return index_size; }
      uint64_t first_set_bit()const
      {
          uint64_t base = _base_index.first_set_bit();
          if( base >= sub_index_count ) 
          {
              return Size;
          }
          auto subidx = _sub_index[base].first_set_bit();
          return base * sub_index_size + subidx; //_sub_index[base].first_set_bit(); 
      }
      bool get( uint64_t bit )const
      {
         assert( bit < Size );
         int64_t sub_idx     = (bit/sub_index_size);
         int64_t sub_idx_bit = (bit%sub_index_size);
         return _sub_index[sub_idx].get(  sub_idx_bit );
      }
      
      void set( uint64_t bit )
      {
         assert( bit < Size );
         int64_t sub_idx     = (bit/sub_index_size);
         int64_t sub_idx_bit = (bit%sub_index_size);
         _base_index.set(sub_idx);
         return _sub_index[sub_idx].set( sub_idx_bit );
      }
      
      bool clear( uint64_t bit )
      {
         assert( bit < Size );
         int64_t sub_idx     = (bit/sub_index_size);
         int64_t sub_idx_bit = (bit%sub_index_size);
         if( _sub_index[sub_idx].clear( sub_idx_bit ) )
            return _base_index.clear(sub_idx);
         return false;
      }
      
      void set_all()
      {
         _base_index.set_all();
         for( uint64_t i = 0; i < sub_index_count; ++i )
         {
           _sub_index[i].set_all();
         }
      }
      
      void clear_all()
      {
         _base_index.clear_all();
         for( uint64_t i = 0; i < sub_index_count; ++i )
         {
           _sub_index[i].clear_all();
         }
      }
      
      uint64_t count()const
      {
         uint64_t c = 0;
         for( uint64_t i = 0; i < sub_index_count; ++i )
         {
            c+=_sub_index[i].count();
         }
         return 0;
      }

      /**
       *  Returns the in64_t that contains bit
       */
      uint64_t& get_bits( uint64_t bit )
      {
         int64_t sub_idx      = (bit/sub_index_size);
         int64_t sub_idx_bit  = (bit%sub_index_size);
         return _sub_index[sub_idx].get_bits( sub_idx_bit );
      }


      struct iterator
      {
         public:
            uint64_t&  get_bits()         { return sub_itr.get_bits(); }
            bool       operator*()const   { return *sub_itr;           }
            bool       end()const { return sub_idx >= sub_index_count; }
            int64_t    bit()const { return pos; }
            void       set() 
            { 
                bit_idx->_base_index.set(sub_idx); 
                sub_itr.set();
            }
            bool       clear() 
            { 
                if( sub_itr.clear() )
                {
                  return bit_idx->_base_index.clear(sub_idx); 
                }
                return false;
            }

            /**
             *  Find the next bit after this one that is set..
             */
            iterator&  next_set_bit()
            {
                if( end() ) return *this;
                sub_itr.next_set_bit();
                if( sub_itr.end() )
                {
                   sub_idx = bit_idx->_base_index.at(sub_idx).next_set_bit().bit();
                   if( end() )
                   {
                      pos = Size;
                      return *this;
                   }
                   auto fb = bit_idx->_sub_index[sub_idx].first_set_bit();
                   sub_itr = bit_idx->_sub_index[sub_idx].at(fb);
                }
                pos = sub_idx * sub_index_size + sub_itr.bit();
                return *this;
            }

            /**
             *  Move to the next bit.
             */
            iterator&  operator++()
            { 
               assert( !end() );
               ++pos;
               ++sub_itr;
               if( sub_itr.end() )
               {
                  ++sub_idx;
                  if( !end() )
                  {
                     sub_itr = bit_idx->_sub_index[sub_idx].begin();
                  }
                  else pos = Size;
               }
               return *this;
            }
            iterator& operator++(int) { return this->operator++(); }
            iterator operator+(uint64_t delta) { return iterator( bit_idx, pos + delta ); }


            iterator( bit_index* self=nullptr, int64_t bit=Size)
            :bit_idx(self),pos(bit),sub_idx((bit/64)%64)
            {
               sub_itr = bit_idx->_sub_index[sub_idx].at(bit%sub_index_size);
            }
            iterator& operator=(const iterator& i )
            {
               bit_idx = i.bit_idx;
               pos = i.pos;
               sub_idx = i.sub_idx;
               sub_itr = i.sub_itr;
               return *this;
            }
         private:
            friend class bit_index;
            bit_index*                          bit_idx;
            int64_t                             pos;
            int8_t                              sub_idx;
            typename bit_index<sub_index_size>::iterator sub_itr;
      };

      iterator begin()            { return iterator( this, 0 );    }
      iterator end()              { return iterator( this, Size ); }
      iterator at(int64_t p)      { return iterator( this, p );    }
    protected:
      friend class iterator;
      bit_index<64>              _base_index;
      bit_index<sub_index_size>  _sub_index[sub_index_count];
};


================================================
FILE: disruptor.hpp
================================================
#pragma once
#include <memory>
#include <vector>
#include <stdint.h>
#include <unistd.h>
#include <atomic>
#include <assert.h>
#include <iostream>

namespace disruptor
{

class eof : public std::exception
{
   public:
    virtual const char* what()const noexcept { return "eof"; }
};


/**
 *  A sequence number must be padded to prevent false sharing and
 *  access to the sequence number must be protected by memory barriers.
 *
 *  In addition to tracking the sequence number, additional state associated
 *  with the sequence number is also made available.  No false sharing 
 *  should occur because all 'state' is only written by one thread. This
 *  extra state includes whether or not this sequence number is 'EOF' and
 *  whether or not any alerts have been published.
 */
class sequence
{
   public:
      sequence( int64_t v = 0 ):_sequence(v),_alert(0){}

      int64_t  lazy_read()const                   { return *((volatile int64_t*)&_sequence);}// .load( std::memory_order_acquire); }
      //volatile int64_t& lazy_write()              { return *((volatile int64_t*)&_sequence);}// .load( std::memory_order_acquire); }
      int64_t  aquire()const                      { return _sequence.load( std::memory_order_acquire); }
      int64_t  aquire_pending()const              { return _pending_sequence.load( std::memory_order_acquire); }
      void     lazy_store( int64_t value )        { _sequence.store(value, std::memory_order_relaxed); }
      void     store( int64_t value )             { _sequence.store(value, std::memory_order_release); }
      void     store_pending( int64_t value )     { _pending_sequence.store(value, std::memory_order_release); }
      void     set_eof()    { _alert = 1; }
      void     set_alert()  { _alert = -1; }
      bool     eof()const   { return _alert == 1; }
      bool     alert()const { return _alert != 0; }

      int64_t atomic_increment_and_get( uint64_t inc ) 
      { 
        return _sequence.fetch_add(inc, std::memory_order::memory_order_release) + inc;
      }

      int64_t increment_and_get( uint64_t inc ) 
      { 
          auto tmp = aquire() + inc;
          store( tmp );
          return tmp;
      }

   private:
      std::atomic<int64_t> _sequence;
      volatile int64_t     _alert;
      std::atomic<int64_t> _pending_sequence;
      int64_t              _post_pad[5];
};

class event_cursor;

/**
 *   A barrier will block until all cursors it is following are
 *   have moved past a given position.  The barrier uses a
 *   progressive backoff strategy of busy waiting for 1000 
 *   tries, yielding for 1000 tries, and the usleeping in 10 ms
 *   intervals.   
 *
 *   No wait conditions or locks are used because they would
 *   be 'intrusive' to publishers which must check to see whether
 *   or not they must 'notify'.  The progressive backoff approach
 *   uses little CPU and is a good compromise for most use cases.
 */
class barrier 
{
   public:
      void follows( const event_cursor& e );

      /**
       *  Used to check how much you can read/write without blocking.
       *
       *  @return the min position of every cusror this barrier follows.
       */
      int64_t get_min();

      /*
       *  This method will wait until all s in seq >= pos using a progressive
       *  backoff of busy wait, yield, and usleep(10*1000)
       *
       *  @return the minimum value of every dependency
       */
      int64_t wait_for( int64_t pos )const;
   private:
      mutable int64_t                   _last_min;
      std::vector<const event_cursor*>  _limit_seq;
};

/**
 *  Provides a automatic index into a ringbuffer with
 *  a power of 2 size.
 */
template<typename EventType, uint64_t Size = 1024>
class ring_buffer
{
    public:
      typedef EventType event_type;

      static_assert( ((Size != 0) && ((Size & (~Size + 1)) == Size)), 
                     "Ring buffer's must be a power of 2" );

      /** @return a read-only reference to the event at pos */
      const EventType& at( int64_t pos )const 
      {
        return _buffer[pos & (Size-1)];
      }

      /** @return a reference to the event at pos */
      EventType& at( int64_t pos )
      {
        return _buffer[pos & (Size-1)];
      }

      /** useful to check for contiguous ranges when EventType is
       *  POD and memcpy can be used.  OR if the buffer is being used
       *  by a socket dumping raw bytes in.  In which case memcpy
       *  would have to use to ranges instead of 1.
       */
      int64_t get_buffer_index( int64_t pos )const { return pos & (Size-1); }
      int64_t get_buffer_size()const               { return Size;           }

    private:
      EventType            _buffer[Size];
};

/**
 *  A cursor is used to track the location of a publisher / subscriber within
 *  the ring buffer.  Cursors track a range of entries that are waiting
 *  to be processed.  After a cursor is 'done' with an entry it can publish
 *  that fact.  
 *
 *  There are two types of cursors, read_cursors and write cursors.  read_cursors
 *  block when they need to 
 *
 *  Events between [begin,end) may be processed at will for readers.  When a reader
 *  is done they can 'publish' their progress which will move begin up to
 *  published position+1.   When begin == end, the cursor must call wait_for(end), 
 *  wait_for() will return a new 'end'.  
 *
 *  @section read_cursor_example Read Cursor Example
 *  @code
      auto source   = std::make_shared<ring_buffer<EventType,SIZE>>();
      auto dest     = std::make_shared<ring_buffer<EventType,SIZE>>();
      auto p        = std::make_shared<write_cursor>("write",SIZE);
      auto a        = std::make_shared<read_cursor>("a");

      a->follows(p);
      p->follows(a);

      auto pos      = a->begin();
      auto end      = a->end();
      while( true ) 
      {
         if( pos == end )
         {
             a->publish(pos-1);
             end = a->wait_for(end);
         }
         dest->at(pos) = source->at(pos);
         ++pos;
      }
 *  @endcode
 *
 *
 *  @section write_cursor_example Write Cursor Example
 *
 *  The following code would run in the publisher thread.  The
 *  publisher can write data without 'waiting' until it pos is
 *  greater than or equal to end.  The 'initial condition' of
 *  a publisher is with pos > end because the write cursor
 *  cannot 'be valid' for readers until after the first element
 *  is written.  
 *
    @code
        auto pos = p->begin();
        auto end = p->end();
        while( !done )
        {
           if( pos >= end )
           {  
              end = p->wait_for(end);
           }
           source->at( pos ) = i;
           p->publish(pos);
           ++pos;
        }
        // set eof to signal any followers to stop waiting after
        // they hit this position.
        p->set_eof();
    @endcode
 *
 *
 *
 */
class event_cursor
{
   public:
      event_cursor(int64_t b=-1):_name(""),_begin(b),_end(b){}
      event_cursor(const char* n, int64_t b=0):_name(n),_begin(b),_end(b){}

      /** this event processor will process every event
       *  upto, but not including s
       */
      void follows( const event_cursor& s ) { _barrier.follows(s); }

      /** returns one after cursor */
      int64_t begin()const { return _begin; }

      /** returns one after the last ready as of last call to wait_for() */
      int64_t end()const   { return _end;   }


      /** makes the event at p available to those following this cursor */
      void     publish( int64_t p )
      {
         check_alert();
         _begin = p + 1;
         _cursor.store( p );
      }
      void    lazy_publish( int64_t p )
      {
         _begin = p + 1;
         _cursor.lazy_store(p);
      }

      /** when the cusor hits the end of a stream, it can set the eof flag */
      void set_eof(){ _cursor.set_eof(); }

      /** If an error occurs while processing data the cursor can set an 
       *  alert that will be thrown whenever another cursor attempts to wait
       *  on this cursor.
       */
      void  set_alert( std::exception_ptr e ) 
      {   
          _alert = std::move(e); 
          _cursor.set_alert(); 
      }

      /** @return any alert set on this cursor */
      const std::exception_ptr& alert()const { return _alert; }


      /** If an alert has been set, throw! */
      inline void check_alert()const; 

      /** the last sequence number this processor has 
       *  completed.
       */
      const sequence& pos()const { return _cursor; }
      sequence&       pos(){ return _cursor; }

      /** used for debug messages */
      const char* name()const { return _name; }

    protected:
      /** last know available, min(_limit_seq) */
      const char*                   _name;
      int64_t                       _begin;
      int64_t                       _end;
      std::exception_ptr            _alert;
      barrier                       _barrier;
      sequence                      _cursor;
};

/**
 *  Tracks the read position in a buffer
 */
class read_cursor : public event_cursor
{
    public:
      read_cursor(int64_t p=0):event_cursor(p){}
      read_cursor(const char* n, int64_t p=0):event_cursor(n,p){}

      /** @return end() which is > pos */
      int64_t wait_for( int64_t pos )
      {
         try {
          return _end = _barrier.wait_for(pos) + 1;
         }
         catch ( const eof& ) { _cursor.set_eof(); throw; }
         catch ( ... ) { set_alert( std::current_exception() ); throw; }
      }

      /** find the current end without blocking */
      int64_t check_end()
      {
          return _end = _barrier.get_min() + 1;
      }
};

class shared_read_cursor : public read_cursor
{
    public:
      shared_read_cursor(int64_t p=0):read_cursor(p){}
      shared_read_cursor(const char* n, int64_t p=0):read_cursor(n,p){}

      /**
       *  This method will block until 'after_pos' is the 
       *  current pos, then it will set pos to 'pos'
       */
      void publish_after( int64_t pos, int64_t after_pos )
      {
         try {
            assert( pos > after_pos );
            while( _cursor.aquire() < after_pos )
            {
              // TODO:... this is a spinlock, ease CPU HERE... 
            } 
            // _barrier.wait_for(after_pos);
            publish( pos );
         }
         catch ( const eof& ) { _cursor.set_eof(); throw; }
         catch ( ... ) { set_alert( std::current_exception() ); throw; }
      }

      bool is_available( int64_t pos )
      {
         return pos <= _barrier.get_min(); 
      }

      int64_t claim(int64_t num) 
      {  
         auto pos = _claim_cursor.atomic_increment_and_get( num );
         return pos - num;
      }


      sequence      _claim_cursor;
};

typedef std::shared_ptr<read_cursor> read_cursor_ptr;

/**
 *  Tracks the write position in a buffer.
 *
 *  Write cursors need to know the size of the buffer
 *  in order to know how much space is available. 
 */
class write_cursor : public event_cursor
{
    public:
      /** @param s - the size of the ringbuffer, 
       *  required to do proper wrap detection 
       **/
      write_cursor(int64_t s)
      :_size(s),_size_m1(s-1)
      {
        _begin = 0;
        _end   = _size;
        _cursor.store(-1);
      }

      /**
       * @param n - name of the cursor for debug purposes
       * @param s - the size of the buffer.  
       */
      write_cursor(const char* n, int64_t s)
      :event_cursor(n),_size(s),_size_m1(s-1)
      {
         _begin = 0;
         _end   = _size;
         _cursor.store(-1);
      }

      /** waits for begin() to be valid and then
       *  returns it.  This is only safe for 
       *  single producers, multi-producers should 
       *  use claim(1) instead.
       */
      int64_t wait_next() 
      {
          wait_for( _begin );
          return _begin;
      }

      /**
       *   We need to wait until the available space in
       *   the ring buffer is  pos - cursor which means that
       *   all readers must be at least to pos - _size and
       *   that our new end is the min of the readers + _size
       */
      int64_t wait_for( int64_t pos )
      {
         try 
         {
           // throws exception on error, returns 'short' on eof
           return _end = _barrier.wait_for(  pos - _size ) + _size;  
         } 
         catch ( ... ) 
         { 
            set_alert( std::current_exception() ); throw; 
         }
      }
      int64_t check_end()
      {
          return _end = _barrier.get_min() + _size;
      }
    private:
      const int64_t _size;
      const int64_t _size_m1;
};

typedef std::shared_ptr<write_cursor> write_cursor_ptr;
/**
 *  When there are multiple writers this cursor can
 *  be used to reserve space in the write buffer 
 *  in an atomic manner.
 *
 *  @code
 *  auto start = cur->claim(slots);
 *  ... do your writes...
 *  cur->publish_after( start + slots, start -1 );
 *  @endcode
 *
 *  @todo
 *  An alternative implementation of this would involve
 *  having a sequence number for each thread.  A pre-allocated
 *  array of sequence pointers would be initialized to null.
 *  There would be a 'thread-specific' index into this array
 *  that would be allocated by an atomic inc the first time
 *  a new thread attempts to write.   Each sequence number
 *  would maintain two sequence numbers: published and
 *  pending.  
 *
 *  To determine the actual 'position' of the write
 *  cursor one would return the MIN( pending ) -1 or
 *  if no sequences are in the 'pending state' the
 *  MAX(published).  The pending state is any time
 *  the pending > published.
 *
 *  The consequence of this approach is that readers
 *  would have to perform more work to determine the end
 *  (reading from all thread positions), the benefit is
 *  that the producers would never have to 'wait' on
 *  each other.  
 *
 *  A variation on this would be to have a fixed 
 *  set of producers instead of a dynamic set.  This 
 *  fixed set would be configured at the start.
 *
 *  If there is low write-contention then this approach
 *  would probably be poor.
 */
class shared_write_cursor : public write_cursor 
{
   public:
      /** @param s - the size of the ringbuffer, 
       *  required to do proper wrap detection 
       **/
      shared_write_cursor(int64_t s)
      :write_cursor(s){}

      /**
       * @param n - name of the cursor for debug purposes
       * @param s - the size of the buffer.  
       */
      shared_write_cursor(const char* n, int64_t s)
      :write_cursor(n,s){}

      /** When there are multiple writers they cannot both
       *  assume the right to write to begin() to end(), 
       *  instead they must first claim some slots in an
       *  atomic manner.
       *
       *
       *  After pos().aquire() == claim( slots ) -1 the claimer
       *  is free to call publish up to start + slots -1 
       *
       *  @return the first slot the caller may write to.
       */   
      int64_t claim( size_t num_slots )
      {
           auto pos = _claim_cursor.atomic_increment_and_get( num_slots );
      //     std::cerr<<"  shared_write: publish "<<pos<<" after " << (pos-1) << " current pos: "<<_cursor.aquire()<<"\n";
           // make sure there is enough space to write
           wait_for( pos -1 ); // TODO: -1????
           return pos - num_slots;
      }

      /**
       *  This method will block until 'after_pos' is the 
       *  current pos, then it will set pos to 'pos'
       */
      void publish_after( int64_t pos, int64_t after_pos )
      {
         try {
            assert( pos > after_pos );
          //  std::cerr<<"publish "<<pos<<" after " << after_pos << " current pos: "<<_cursor.aquire()<<"\n";
            while( _cursor.aquire() != after_pos )
            {
              // TODO:... this is a spinlock, ease CPU HERE... 
              usleep(0);
            } 
            // _barrier.wait_for(after_pos);
            publish( pos );
         }
         catch ( const eof& ) { _cursor.set_eof(); throw; }
         catch ( ... ) { set_alert( std::current_exception() ); throw; }
      }
    private:
      sequence      _claim_cursor;
};


typedef std::shared_ptr<shared_write_cursor> shared_write_cursor_ptr;


inline void barrier::follows( const event_cursor& e )
{
    _limit_seq.push_back( &e );
}

inline int64_t barrier::get_min()
{
   int64_t min_pos = 0x7fffffffffffffff;
   for( auto itr = _limit_seq.begin(); itr != _limit_seq.end(); ++itr )
   {
      auto itr_pos = (*itr)->pos().aquire();
      if( itr_pos < min_pos ) min_pos = itr_pos;
   }
   return _last_min = min_pos;
}

inline int64_t barrier::wait_for( int64_t pos )const
{
   if( _last_min > pos ) 
      return _last_min;

   int64_t min_pos = 0x7fffffffffffffff;
   for( auto itr = _limit_seq.begin(); itr != _limit_seq.end(); ++itr )
   {
      int64_t itr_pos = 0;
      itr_pos = (*itr)->pos().aquire();
      // spin for a bit 
      for( int i = 0; itr_pos < pos && i < 10000; ++i  )
      {
         itr_pos = (*itr)->pos().aquire();
         if( (*itr)->pos().alert() ) break;
      }
      // yield for a while, queue slowing down
      for( int y = 0; itr_pos < pos && y < 10000; ++y )
      {
         usleep(0);
         itr_pos = (*itr)->pos().aquire();
         if( (*itr)->pos().alert() ) break;
      }

      // queue stalled, don't peg the CPU but don't wait
      // too long either...
      while( itr_pos < pos )
      {
         usleep( 10*1000 );
         itr_pos = (*itr)->pos().aquire();
         if( (*itr)->pos().alert() ) break;
      }

      if( (*itr)->pos().alert() )
      {
         (*itr)->check_alert();
         if( itr_pos > pos ) 
            return itr_pos -1; // process everything up to itr_pos
         throw eof();
      }


      if( itr_pos < min_pos ) 
          min_pos = itr_pos; 
   }
   //assert( min_pos != 0x7fffffffffffffff );
   return _last_min = min_pos;
}

inline void event_cursor::check_alert()const
{
    if( _alert != std::exception_ptr() ) std::rethrow_exception( _alert );
}


} // namespace disruptor


================================================
FILE: fast_rand.cpp
================================================
#include <stdint.h>
#include <memory.h>
#include <stdlib.h>
#include <iostream>
#include <vector>
#include <assert.h>
#include <unistd.h>
#ifdef _MSC_VER
#pragma intrinsic(__rdtsc)
uint64_t get_cc_time () {
    return __rdtsc();
}
#else
/* define this somewhere */
#ifdef __i386
__inline__ uint64_t rdtsc() {
     uint64_t x;
       __asm__ volatile ("rdtsc" : "=A" (x));
         return x;
}
#elif __amd64
__inline__ uint64_t rdtsc() {
     uint64_t a, d;
       __asm__ volatile ("rdtsc" : "=a" (a), "=d" (d));
         return a; //(d<<32) | a;
}
#endif


uint64_t get_cc_time () {
   return rdtsc();
}
#endif


// Some primes between 2^63 and 2^64 for various uses.
// source: CityHash
static const uint64_t k0 = 0xc3a5c85c97cb3127ULL;
static const uint64_t k1 = 0xb492b66fbe98f273ULL;
static const uint64_t k2 = 0x9ae16a3b2f90404fULL;

inline uint64_t ShiftMix(uint64_t val) { return val ^ (val >> 47); }

uint64_t fast_rand()
{
  int64_t now = rdtsc(); //get_cc_time();
  char*   s = (char*)&now; // note first 4 bits are 'LSB' on intel... 
                           // on bigendian machine we want to add 4
                           // LSB is most rand, the higher-order bits
                           // will not change much if at all between
                           // calls...

  const uint8_t a = s[0];
  const uint8_t b = s[4 >> 1];
  const uint8_t c = s[4 - 1];
  const uint32_t y = static_cast<uint32_t>(a) + (static_cast<uint32_t>(b) << 8);
  const uint32_t z = 4 + (static_cast<uint32_t>(c) << 2);
  return ShiftMix(y * k2 ^ z * k0) * k2;
}


================================================
FILE: fc_heap.hpp
================================================
#pragma once
#include "mmap_alloc.hpp"
#include <iostream>
#include <sstream>
#include <assert.h>
#include <string.h>
#include <vector>
#include <unordered_set>


#define CHECK_SIZE( x ) assert(((x) != 0) && !((x) & ((x) - 1)))
#define PAGE_SIZE (2*1024*1024)
#define LOG2(X) ((unsigned) (8*sizeof (unsigned long long) - __builtin_clzll((X)) - 1))
#define LZERO(X)  (__builtin_clzll((X)) )
#define NUM_BINS 32 // log2(PAGE_SIZE)

class block_header
{
  public: 
      block_header()
      :_prev_size(0),_size(-PAGE_SIZE),_flags(0)
      {
          //fprintf( stderr, "constructor... size: %d\n", _size );
          //memset( data(), 0, size() - 8 );
          assert( page_size() == PAGE_SIZE );
      }

      void* operator new (size_t s) { return malloc(PAGE_SIZE);/*mmap_alloc( PAGE_SIZE );*/ }
      void operator delete( void* p ) { free(p); /*mmap_free( p, PAGE_SIZE );*/ }

      void dump( const char* label )
      {
         fprintf( stderr, "%s ]  _prev_size: %d  _size: %d\n", label, _prev_size, _size);//, int(_flags) );
      }

      /** size of the block header including the header, data size is size()-8 */
      uint32_t      size()const { return abs(_size);                                            } 
      char*         data()      { return reinterpret_cast<char*>(((char*)this)+8);                       }

      block_header* next()const 
      { 
        return _size <= 0 ? nullptr : reinterpret_cast<block_header*>(((char*)this)+size());
      }

      block_header* prev()const      
      { 
        return _prev_size <= 0 ? nullptr : reinterpret_cast<block_header*>(((char*)this)-_prev_size); 
      }

      /** 
       *  creates a new block of size S at the end of this block.
       *
       *  @pre size is a power of 2
       *  @return a pointer to the new block, or null if no split was possible
       */ 
      block_header* split( uint32_t sz )
      {
         assert( sz >= 32 );
         assert( size() >= 32 );
         assert( sz <= (size() - 32) );
         assert( page_size() == PAGE_SIZE );
         assert( _size != 0xbad );
         CHECK_SIZE(sz);

         int32_t old_size      = _size;
         block_header* old_nxt = next(); 

         _size = size() - sz;
         assert( _size != 0 );
         block_header* nxt = next();
         assert( nxt != 0 );

         nxt->_prev_size   = _size;
         nxt->_size        = old_size < 0 ? -sz : sz;
         assert( _size != 0 );

         if( old_nxt ) old_nxt->_prev_size = nxt->_size;

         //memset( data(), 0, size()-8 );

         assert( size() + nxt->size() == uint32_t(abs(old_size)) );
         assert( nxt->next() == old_nxt );
         assert( nxt->prev() == this );
         assert( next() == nxt );
         assert( page_size() == PAGE_SIZE );
         assert( nxt->page_size() == PAGE_SIZE );
         assert( nxt != this );
         nxt->_flags = 0;
         return nxt;
      }

      /**
       *   @return the merged node, if any
       */
      block_header* merge_next()
      {
         assert( _size != 0xbad );
         block_header* cur_next = next();
         if( !cur_next ) return this;
         assert( cur_next->_size != 0xbad );
         assert( cur_next->size() > 0 );

       //  if( !cur_next->is_idle() ) return this;

         auto s = size();

         assert( _size > 0 );
         _size += cur_next->size();
         assert( _size != 0 );

         if( cur_next->_size > 0 ) 
         {
            block_header* new_next = next();
            new_next->_prev_size = size();
         }
         else
         {
            _size = -_size; // we are at the end.
            assert( _size != 0 );
         }
         assert( cur_next->_size = 0xbad );


        // memset( data(), 0, size()-8 );
         assert( size() > s );
         if( next() )
         {
          assert( size()/8 == next() - this );
          assert( next()->_prev_size == size() );
          assert( page_size() == PAGE_SIZE );
         }
         return this;
      }

      /**
       *   @return the merged node, or this.
       */
      block_header* merge_prev()
      {
         assert( page_size() == PAGE_SIZE );
         block_header* pre = prev();
         if( !pre ) return this;
         return prev()->merge_next();
      }

      block_header* head()
      {
         if( !prev() ) return this;
         return prev()->head();
      }
      block_header* tail()
      {
         if( !next() ) return this;
         return next()->tail();
      }

      size_t        page_size()
      {
         auto t = tail();
         auto h = head();
         return ((char*)t-(char*)h) + t->size();
      }

      struct queue_state // the block is serving as a linked-list node
      {
          block_header*    qnext;
          block_header*    qprev;
          block_header**   head;
          block_header**   tail;
      };

      enum flag_enum 
      { 
        queued = 1, 
        idle   = 2,
        active = 4
      };

      bool         is_idle()const { return _flags & idle;  }
      bool         is_active()const { return _flags & active; }
      bool         is_queued()const { return _flags & queued;  }

      void         set_active( bool s )
      {
        if( s ) _flags |= active;
        else    _flags &= ~active;
      }
      void         set_queued( bool s ) 
      {
        if( s ) _flags |= queued;
        else    _flags &= ~queued;

        // anytime we change state it should be reset..
        if( is_queued() )
        {
          as_queue().qnext = nullptr;
          as_queue().qprev = nullptr;
        }
      }

      /** removes this node from any queue it is in */
      void dequeue()
      {
         block_header* pre = as_queue().qprev; 
         block_header* nxt = as_queue().qnext; 
         if( pre ) pre->as_queue().qnext = nxt;
         if( nxt ) nxt->as_queue().qprev = pre;
         set_queued(false);
      }

      void         set_idle( bool s ) 
      {
        if( s ) _flags |= idle;
        else    _flags &= ~idle;
        assert( is_idle() == s );
      }
      queue_state& as_queue()  
      { 
    //    assert( is_queued() );
        return *reinterpret_cast<queue_state*>(data()); 
      }

//  private:
      int32_t   _prev_size; // size of previous header.
      int32_t   _size:24; // offset to next, negitive indicates tail, 8 MB max, it could be neg
      int32_t   _flags:8; // offset to next, negitive indicates tail
};
static_assert( sizeof(block_header) == 8, "Compiler is not packing data" );

typedef block_header* block_header_ptr;

struct block_stack
{
    public:
      block_stack():_head(nullptr){}

      void push( block_header* h )
      {
         h->as_queue().qnext = _head;
         if( _head ) _head->as_queue().qprev = h;
         _head = h;
         //_head.push_back(h);
      }
      void push_all( block_header* h )
      {
         assert( h->is_queued() );
         assert( _head == nullptr );
         _head = h;
      }

      /*
      bool pop( block_header* h )
      {
         if( _head == nullptr ) return null;
         return _head.erase(h) != 0;
      }
      */

      /** returns all blocks */
      block_header* pop_all()
      {
        block_header* h = _head;
        _head = nullptr;
        return h;
      }

      block_header* pop()
      {
         if( _head )
         {
            auto tmp = _head;
            _head = _head->as_queue().qnext;
            if( _head )
            _head->as_queue().qprev = nullptr;
            return tmp;
         }
         return nullptr;
         /*
         if( _head.size() == 0 ) return nullptr;
         auto f = _head.begin();
         auto h = *f;
         _head.erase(f);
         return h;
         */
      }

      block_header* head(){ return _head; }

      //int size() { return int(_head.size()); }
    
    private:
      //std::unordered_set<block_header*> _head;
      block_header* _head;
};

/**
 *  Single threaded heap implementation, foundation
 *  for multi-threaded version;
 */
class fc_heap 
{
   public:
      block_header* alloc( size_t s );
      void          free( block_header* h );

      fc_heap()
      {
        memset(_bins, 0, sizeof(_bins) ); 
        _free_32_data = mmap_alloc( PAGE_SIZE );
        _free_64_data = mmap_alloc( PAGE_SIZE );

        _free_32_data_end = _free_32_data + PAGE_SIZE;
        _free_64_data_end = _free_64_data + PAGE_SIZE;

        _free_32_scan_end = &_free_32_state[PAGE_SIZE/32/64];
        _free_64_scan_end = &_free_64_state[PAGE_SIZE/64/64];

        _free_32_scan_pos = _free_32_state;
        _free_64_scan_pos = _free_64_state;

        memset( _free_32_state, 0xff, sizeof(_free_32_state ) );
        memset( _free_64_state, 0xff, sizeof(_free_64_state ) );
      }
      ~fc_heap()
      {
        mmap_free( _free_64_data, PAGE_SIZE );
        mmap_free( _free_32_data, PAGE_SIZE );
      }

 //  private:
      char* alloc32()
      {
         uint32_t c = 0;
         while( 0 == *_free_32_scan_pos )
         {
            ++_free_32_scan_pos;
            if( _free_32_scan_pos == _free_32_scan_end )
            {
                _free_32_scan_pos = _free_32_state;
            }
            if( ++c == sizeof(_free_32_state)/sizeof(int64_t) )
            {
              return alloc64();
            }
         }
         int bit = LZERO(*_free_32_scan_pos);
         int offset = (_free_32_scan_pos - _free_32_state)*64;

         *_free_32_scan_pos ^= (1ll<<(63-bit)); // flip the bit
        // fprintf( stderr, "alloc offset: %d bit %d  pos %d\n", offset,bit,(offset+bit) );

         return _free_32_data + (offset+bit)*32;
      }

      char* alloc64()
      {
         uint32_t c = 0;
         while( 0 == *_free_64_scan_pos )
         {
            ++_free_64_scan_pos;
            if( _free_64_scan_pos == _free_64_scan_end )
            {
                _free_64_scan_pos = _free_64_state;
            }
            if( ++c == sizeof(_free_64_state)/sizeof(int64_t) )
            {
              return nullptr;
            }
         }
         int bit = LZERO(*_free_64_scan_pos);
         int offset = (_free_64_scan_pos - _free_64_state)*64;

         *_free_64_scan_pos ^= (1ll<<(63-bit)); // flip the bit

         return _free_64_data + (offset+bit)*64;
      }

      bool free32( char* p )
      {
         if( p >= _free_32_data &&
              _free_32_data_end > p )
         {
            uint32_t offset = (p - _free_32_data)/32;
            uint32_t bit = offset & (64-1);
            uint32_t idx = offset/64;
            
            _free_32_state[idx] ^= (1ll<<((63-bit))); 
            return true;
         }
         return false;
      }
      bool free64( char* p )
      {
         if( p >= _free_64_data &&
              _free_64_data_end > p )
         {
          uint32_t offset = (p - _free_64_data)/64;
          uint32_t bit = offset & (64-1);
          uint32_t idx = offset/64;

          _free_64_state[idx] ^= (1ll<<((63-bit))); 
          return true;
         }
         return false;
      }

      char*                       _free_32_data;
      char*                       _free_64_data;
      char*                       _free_32_data_end;
      char*                       _free_64_data_end;
      uint64_t*                   _free_32_scan_pos;
      uint64_t*                   _free_64_scan_pos;
      uint64_t*                   _free_32_scan_end;
      uint64_t*                   _free_64_scan_end;
      uint64_t                    _free_32_state[PAGE_SIZE/32/64];
      uint64_t                    _free_64_state[PAGE_SIZE/64/64];
      block_stack _bins[NUM_BINS]; // anything less than 1024 bytes
};


/**
 *  Return a block of size s or greater
 *  @pre size >= 32
 *  @pre size is power of 2
 */
block_header* fc_heap::alloc( size_t s )
{
   assert( s >= 32 );
   CHECK_SIZE( s ); // make sure it is a power of 2
   uint32_t min_bin = LOG2(s); // find the min bin for it.
   while( min_bin < 32 )
   {
      block_header* h = _bins[min_bin].pop();
      if( h )
      {
          assert( h->_size != 0 );
          assert( h->_size != 0xbad );
          assert( h->is_queued() );
          h->set_queued(false);
          if( h->size() - 32 < s  )
          {
            h->set_active(true);
            return h;
          }
          block_header* tail = h->split(s); 
          assert( h->_size != 0 );

          h->set_active(true);
          this->free(h);

          tail->set_active(true);
          return tail;
      }
      ++min_bin;
   }
   // mmap a new page
   block_header* h = new block_header();
   block_header* t = h->split(s);

   h->set_active(true);
   free(h);

   t->set_active(true);
   return t;
}

void fc_heap::free( block_header* h )
{
    assert( h != nullptr );
    assert( h->is_active() );
    assert( h->_size != 0 );
    assert( h->size() < PAGE_SIZE );

    auto pre = h->prev();
    auto nxt = h->next();

    if( nxt && !nxt->is_active() && nxt->is_queued() )
    {
        auto nxt_bin = LOG2(nxt->size());
        if( _bins[nxt_bin].head() == nxt )
        {
          _bins[nxt_bin].pop();
          nxt->set_queued(false);
        }
        else
        {
          nxt->dequeue();
        }
        h = h->merge_next();
    }

    if( pre && !pre->is_active() && pre->is_queued() )
    {
        auto pre_bin = LOG2(pre->size());
        if( _bins[pre_bin].head() == pre )
        {
          _bins[pre_bin].pop();
          pre->set_queued(false);
        }
        else
        {
          pre->dequeue();
        }
        h = pre->merge_next();
    }

    if( h->size() == PAGE_SIZE )
    {
      delete h;
      return;
    }

    h->set_active(false);
    h->set_queued(true );
    auto hbin = LOG2(h->size());
    _bins[hbin].push(h);
}

class thread_heap;

class garbage_thread
{
   public:
      static garbage_thread& get();
      uint64_t               avail( int bin );
      int64_t                claim( int bin, int64_t num );
      block_header*          get_claim( int bin, int64_t pos );

   protected:
      void   register_thread_heap( thread_heap* h );

      friend class thread_heap;
      static void run();
};


class thread_heap
{
  public:
    static thread_heap& get();

    block_header* allocate( size_t s )
    {
       if( s >= PAGE_SIZE )
       {
          // TODO: allocate special mmap region...
       }

       uint32_t min_bin = LOG2(s); // find the min bin for it.
       while( min_bin < NUM_BINS )
       {
          block_header* h = cache_alloc(min_bin, s);
          if( h ) return h;

          garbage_thread& gc = garbage_thread::get();
          if( auto av = gc.avail( min_bin ) )
          {
             int64_t claim_num = std::min<int64_t>(4,av);
             int64_t claim = gc.claim( min_bin, claim_num );
             int64_t end = claim + claim_num;
             while( claim < end )
             {
                block_header* h = gc.get_claim(min_bin,claim);
                if( h )
                {
                   cache(h);
                }
                ++claim;
             }
             h = cache_alloc(min_bin, s);
             if( h ) return h; // else... we actually didn't get our claim
          }
          ++min_bin;
       }
       block_header* h = new block_header();
       h->set_active(true);
       if( s <= PAGE_SIZE - 32 )
       {
          block_header* t = h->split(s);
          t->set_active(true);
          cache( h );
          return t;
       }
       return h;
    }

    block_header* cache_alloc( int bin, size_t s )
    {
       block_header* c = pop_cache(bin);
       if( c && (c->size() - 32) > s )
       {
           block_header* t = c->split(s);
           c->set_active(true);
           if( !cache( c ) )
           {
             this->free(c);
           }
           t->set_active(true);
           return t;
       }
       return nullptr;
    }

    bool          cache( block_header* h )
    {
       uint32_t b = LOG2( h->size() );
       if( _cache_size[b] < 4 ) 
       {
         h->set_queued(true);
         _cache[b].push(h);
         _cache_size[b]++;
         return true;
       }
       return false;
    }

    block_header* pop_cache( int bin )
    {
        block_header* h = _cache[bin].pop();
        if( h ) 
        { 
          _cache_size[bin]--; 
          h->set_queued(false);
          return h;
        }
        return nullptr;
    }

    void free( block_header* h )
    {
       h->set_queued(true);
       _gc_on_deck.push( h );
       if( !_gc_at_bat.head() )
         _gc_at_bat.push_all( _gc_on_deck.pop_all() );
    }
  private:
    thread_heap();

    friend garbage_thread;
    block_stack _gc_at_bat; // waiting for gc to empty
    block_stack _gc_on_deck; // caching until gc pickups at bat
    block_stack _cache[NUM_BINS];
    int16_t     _cache_size[NUM_BINS];

};


static fc_heap static_heap;

void* fc_malloc( size_t s )
{
  if( s <= 64 ) 
  {
    if( s <= 32 )
        return static_heap.alloc32();
    else
        return static_heap.alloc64();
  }
  // round up to nearest power of 2 > 32
  s += 8; // room for header.
  if( s < 32 ) s = 32; // min size
  s = (1<<(LOG2(s-1)+1)); // round up to nearest power of 2
  if( s < 24 ) s = 24;

  block_header* h = static_heap.alloc( s );
  assert( h->is_active() );
//  h->set_idle(false); 
//  assert( h->page_size() == PAGE_SIZE );
  return h->data();
}
void fc_free( void* f )
{
  if( static_heap.free32((char*)f) || static_heap.free64((char*)f) ) return; 
  block_header* bh = (block_header*)(((char*)f)-8);
 // fprintf( stderr, "fc_free(block: %p)\n", bh );
//  assert( bh->is_active() );
  //assert( bh->page_size() == PAGE_SIZE );
  static_heap.free(bh);
}


================================================
FILE: fc_malloc.cpp
================================================


/*
pool<24>   p24;
pool<58>   p58;
pool<120>  p120;
pool<248>  p248;
pool<504>  p504;
pool<1016> p1016;
pool<2040> p2040;
pool<4088> p4088;
*/


void* fc_malloc( size_t s )
{
#define TRY_POOL(I,X,S)   if( len < X ) return pool<I,X,S>::alloc(); 
    TRY_POOL(1,24,256);
    TRY_POOL(2,58,256);
    TRY_POOL(3,120,256);
    TRY_POOL(4,248,128);
    TRY_POOL(5,504,128);
    TRY_POOL(6,1016,128);
    TRY_POOL(7,2040,64);
    TRY_POOL(8,4088,64);
    TRY_POOL(9,8184,64);


    if( len < 64*1024 )
    {
    }
    if( len < 1024*1024 )
    {

    }
    else
    {
       uint64_t* m = malloc( s+8);
       *m = -1;
       return m+1;
    }
}

free( void* f )
{

}


================================================
FILE: fc_malloc.h
================================================
void* fc_malloc( size_t s );
free( void* f );


================================================
FILE: fixed_pool.hpp
================================================
#include <thread>
#include <atomic>
#include "mmap_alloc.hpp"
#include "bit_index.hpp"

#define GB (1024LL*1014LL*1024LL)
#define MB (1024LL*1024LL)
#define LOG2(X) ((unsigned) (8*sizeof (unsigned long long) - __builtin_clzll((X)) - 1))

class basic_page
{
  public:
     basic_page():_next_page(nullptr){}
    virtual ~basic_page(){}
    virtual void  release() = 0;
    virtual void* alloc() = 0;
    virtual void  free( void* ) = 0;
    virtual int   get_page_pos() = 0;
    virtual int   get_pool() = 0;
    virtual int64_t   get_available()const = 0;
    basic_page* _next_page;
 //   virtual void  item_size()const = 0;
};

typedef basic_page* basic_page_ptr;
class basic_pool
{
  public:
    virtual ~basic_pool(){}
    virtual basic_page* claim_page() = 0;
    virtual bool  gc_free(void*) = 0;
    virtual void gc_release( basic_page_ptr p ) = 0;
};
typedef basic_pool* basic_pool_ptr;


struct free_node
{
  free_node* next;
};

template<uint64_t ItemSize, uint64_t PageSize = 1*MB>
class fixed_pool : public basic_pool
{
  public:
    
    class page : public basic_page
    {
       public:
          page( int64_t claim_pos )
          {
              fprintf( stderr, "CLAIM POS %lld\n", claim_pos );
              _data = (char*)mmap_alloc( PageSize, (void*)((ItemSize << 32) + claim_pos * PageSize) );
              fprintf( stderr, " PAGE DATA: %p\n", _data );
              assert( (int64_t(_data) >> 32) == ItemSize );
              _next_data       = _data;
              _page_end        = _data + PageSize;
              _alloc_free      = nullptr;
              _gc_free_at_bat  = nullptr;
              _gc_free_on_deck = nullptr;
              _claim_pos = claim_pos;
              _alloc = 0;
              _free  = 0;
          }

          int _claim_pos;
          virtual int   get_page_pos() { return _claim_pos; }

          int get_pool() { return LOG2(ItemSize)-4; }

          ~page()
          {
            mmap_free( _data, PageSize );
          }

          void* alloc()
          {
              if( _gc_free_at_bat )
              {
                  fprintf( stderr, "%p   _gc_free_at_bat   page pos %d\n", this, _claim_pos );
                 free_node* gc = _gc_free_at_bat;
                 _gc_free_at_bat = nullptr;

                 while( gc )
                 {
                    free_node* n = gc->next;
                    gc->next = _alloc_free;
                    _alloc_free = gc;
                    gc = n;
                 }
              }
              if( _alloc_free )
              {
                 free_node* n = _alloc_free;
                 _alloc_free = n->next;
                 ++_alloc;
                 return n;
              }
              else if( _next_data != _page_end )
              {
                char* n = _next_data;
                _next_data += ItemSize;
                assert( n < _page_end );
                ++_alloc;
                return n;
              }
              else
              {
                fprintf( stderr, "_next_data == _page_end\n" );
                return nullptr;
              }
          }

          int64_t get_available()const
          {
              return PageSize/ItemSize - _alloc + _free; //_avail;
          }

          void free( void* c )
          {
              assert( c > _data && c < _page_end );
              free_node* n = (free_node*)c;
              n->next = _alloc_free;
              _alloc_free = n;
          }

          void gc_free( void* c )
          {
              //fprintf( stderr, "gc_free(%p)   _data %p   _end %p\n", c, _data, _page_end );
              assert( c >= _data && c < _page_end );
              free_node* n = (free_node*)c;
              n->next = _gc_free_on_deck;
              _gc_free_on_deck = n;

              if( !_gc_free_at_bat )
              {
                _gc_free_at_bat = _gc_free_on_deck;
                _gc_free_on_deck = nullptr;
              }
              ++_free;
          }

          bool is_claimed()const
          {
            return 0 != _claim.load(std::memory_order_relaxed);
          }

          bool claim()
          {
            return 0 == _claim.fetch_add(1);
          }

          void release() 
          {
            _claim.store(0);
          }
       protected:
          friend class thread_local_heap;
          friend class fixed_pool;

          int64_t             _alloc; // count managed by alloc thread
          int64_t             _free;  // count managed by the gc thread

          std::atomic<int>    _claim; // when 0 no one owns this page, first person to inc owns the page.
          
          free_node*          _alloc_free; // free list managed by alloc thread
                              
          free_node*          _gc_free_at_bat; 
          free_node*          _gc_free_on_deck;
          char*               _data;
          char*               _page_end;
          char*               _next_data;

    }; // class page


    /**
     *  Grab the next page with free space or allocate on
     *  if necessary.  This method may be called from any
     *  thread.
     */
    virtual basic_page* claim_page()
    {
        auto rp = _pending_read_pos.load( std::memory_order_relaxed );
        auto wp = _pending_write_pos.load( std::memory_order_relaxed );
        if( rp <= wp )
        {
          int64_t claim = _pending_read_pos.fetch_add(1);
          if( claim <= wp )
          {
             basic_page* p = _pending_pages[claim%32];
             _pending_pages[claim%32] = 0;
             if( p )
             {
              fprintf( stderr, "claiming pending page %p  \n", p);//, p->get_page_pos() );
              return p;
             }
             else
             {
              fprintf( stderr, "pending pages[claim] == null\n" );
             }
          }
        }
        
        int64_t claim = _next_page.fetch_add(1);
        page* p = new page(claim);
        fprintf( stderr, "alloc new page pending page %p  %d\n", p, p->get_page_pos() );
        //p->claim();
        _pages[claim] = p;
        return p;
    }

    virtual bool gc_free( void* v )
    {
        int64_t byte_pos      = (int64_t(v)<<32)>>32;
        int64_t page_num      = byte_pos/(PageSize);
        auto pg = _pages[page_num];
        fprintf( stderr, "page_num %lld  %p\n", page_num, v );
        assert( pg );
        if( pg  )
        {
          pg->gc_free(v);
          return true;
        }
        return false;
    }
    virtual void gc_release( basic_page_ptr p )
    {
       _free_pages.set( p->get_page_pos() );
       auto rp = _pending_read_pos.load(std::memory_order_relaxed);
       auto wp = _pending_write_pos.load(std::memory_order_relaxed);
       while( rp > wp - 31 )
       {
          ++wp;
          auto pos = wp%32;
          if( _pending_pages[pos] == nullptr )
          {
            int b = _free_pages.first_set_bit();
            if( _pages[b] && _pages[b]->get_available() )
            {
              _free_pages.clear(b);
              fprintf( stderr, "pending_pages[%lld] = %p\n", pos, _pages[b] );
              _pending_pages[ pos ] = _pages[b];
            }
            if( !_pages[b] ){ --wp; break; }
         }
       }
       _pending_write_pos.store(wp);
    }

    fixed_pool()
    :_pending_read_pos(0),_pending_write_pos(-1)
    {
       _free_pages.set_all();
       memset( _pages, 0, sizeof(_pages) );
       memset( _pending_pages, 0, sizeof(_pending_pages) );
    }

    typedef page*        page_ptr;
    std::atomic<int>     _next_page; // inc to allocate a new page.

    std::atomic<int64_t> _pending_read_pos;
    std::atomic<int64_t> _pending_write_pos;
    page_ptr             _pending_pages[32];

    // updated by gc thread... 'unclaimed pages' with free data.
    bit_index<64*64/*2*GB/PageSize*/>  _free_pages;
    page_ptr                  _pages[2*GB/PageSize];
};

class thread_local_heap;

class garbage_collector
{
  public:
    garbage_collector()
    :_done(false),
      _tlheaps(nullptr),
     _gc_thread(&garbage_collector::run){}
    ~garbage_collector()
    {
      _done.store(true);
      _gc_thread.join();
    }

    void register_thread_local_heap( thread_local_heap* t );

    static garbage_collector& get()
    {
      static garbage_collector gc;
      return gc;
    }

    static void run();

  private:
    std::atomic<bool>               _done;
    std::atomic<thread_local_heap*> _tlheaps;
    std::thread                     _gc_thread;
};

static basic_pool_ptr get_pool( int p )
{
  if( !(p >= 0 && p < 16 ) )
      fprintf( stderr, "%d", p );
  assert( (p >= 0 && p < 16 ) );
  static basic_pool_ptr _pools[16];
  static bool           _init = [&]()->bool{
     // allocate the pools for all size classes
     _pools[0]  = new fixed_pool<16>();
     _pools[1]  = new fixed_pool<32>();
     _pools[2]  = new fixed_pool<64>();
     _pools[3]  = new fixed_pool<128>();
     _pools[4]  = new fixed_pool<256>();
     _pools[5]  = new fixed_pool<512>();
     _pools[6]  = new fixed_pool<1024>();
     _pools[7]  = new fixed_pool<2*1024>();
     _pools[8]  = new fixed_pool<4*1024>();
     _pools[9]  = new fixed_pool<8*1024>();
     _pools[10] = new fixed_pool<16*1024>();
     _pools[11] = new fixed_pool<32*1024>();
     _pools[12] = new fixed_pool<64*1024>();
     _pools[13] = new fixed_pool<128*1024>();
     _pools[14] = new fixed_pool<256*1024>();
     _pools[15] = new fixed_pool<512*1024>();
     return true;
  }();
  (void)_init; // unused warning
  return _pools[p];
}


class thread_local_heap
{
   public:
      thread_local_heap()
      :_gc_at_bat(nullptr),
       _release_at_bat(nullptr),
       _gc_on_deck(nullptr),
       _release_on_deck(nullptr)
      {
        garbage_collector::get().register_thread_local_heap(this);
      }

      ~thread_local_heap()
      {
      }

      static thread_local_heap& get()
      {
        static __thread thread_local_heap* tlh = nullptr;
        if( !tlh ) tlh = new thread_local_heap();
        return *tlh;
      }

      void* alloc( size_t s )
      {
          int32_t pool  = LOG2(s-1) + 1 - 4;
   //       fprintf( stderr, "pool %d  for size %d\n", pool, int(s) );

          if( !_pages[pool] )
          {
              basic_page_ptr p = get_pool(pool)->claim_page();
              fprintf( stderr, "claim pool! %p\n", p );
              assert(p);
              _pages[pool] = p;
              auto r = p->alloc();
              assert(r);
              return r;
          }
          void* a = _pages[pool]->alloc();

          if( !a )  // the page must be full... release it and get a new one
          {
              fprintf( stderr, "release pool %d  %p\n", pool, _pages[pool] );
              basic_page_ptr p = get_pool(pool)->claim_page();
              assert( p );
              fprintf( stderr, "new page %p   avail: %lld\n", p, p->get_available() );

              _pages[pool]->_next_page = _release_on_deck;
              _release_on_deck = _pages[pool];

              if( _release_at_bat == nullptr )
              {
                _release_at_bat = _release_on_deck;
                _release_on_deck = nullptr;
              }
              _pages[pool] = p;
              assert(p);
              auto r = p->alloc();
              assert(r);
              return r;
          }
          assert( a );
          return a;
      }

      void  free( void* v )
      {
          assert( v != nullptr );

     //     fprintf( stderr, "free %p      tld: %p\n", v, this );
          
         // size_t   s     = int64_t(v)>>32;
         // int32_t  pool  = LOG2(s) - 4;

  //        fprintf( stderr, "Free size: %llu  on pool %d\n", s, pool );

          // try local free first.
         //  if( _pages[pool] && _pages[pool]->free(v) )
         //     return;

          free_node* fv = (free_node*)v;
          assert( fv != _gc_on_deck );

          fv->next = _gc_on_deck;
          _gc_on_deck = fv;

          if( _gc_at_bat == nullptr )
          {
            _gc_at_bat = _gc_on_deck;
            _gc_on_deck = nullptr;
          }
      }

   private:
      friend class garbage_collector;

      free_node*         _gc_at_bat;
      basic_page_ptr     _release_at_bat;
      uint64_t           _gc_pad[7];
      free_node*         _gc_on_deck;
      basic_page_ptr     _release_on_deck;
                         
      // current page for this thread...
      basic_page_ptr     _pages[32]; // sized every power of 2 up to 1MB
      thread_local_heap* _next;
};


void garbage_collector::register_thread_local_heap( thread_local_heap* t )
{
   auto* stale_head = _tlheaps.load(std::memory_order_relaxed);
   do { t->_next = stale_head;
   }while( !_tlheaps.compare_exchange_weak( stale_head, t, std::memory_order_release ) );
}

void garbage_collector::run()
{
  garbage_collector& gc = garbage_collector::get();
  while( true )
  {
    bool found_work = false;
    thread_local_heap* cur = gc._tlheaps.load( std::memory_order_relaxed );
    while( cur )
    {
        free_node* n = cur->_gc_at_bat;
        if( n )
        {
          cur->_gc_at_bat = nullptr;
          found_work = true;
        }
        while( n )
        {
          auto next = n->next;
          // TODO: free N
          int pool = LOG2( int64_t(n) >> 32 ) - 4;
       //   fprintf( stderr, "pool %d  gc_free %p\n", pool, n );
          get_pool( pool )->gc_free(n);
          //fprintf( stderr, "." );
          assert( n != next );
          n = next;
        }
        if( cur->_release_at_bat != nullptr )
        {
           basic_page_ptr p = cur->_release_at_bat;
           cur->_release_at_bat = nullptr;

           while( p )
           {
              p->release();
              int pool = p->get_pool(); //LOG2( int64_t(p) >> 32 ) - 4;
              get_pool( pool )->gc_release(p);
              p = p->_next_page;
           }
        }
        assert( cur != cur->_next );
        cur = cur->_next;
    }
    if( !found_work )
    {
       // TODO: replace with something better..
       ::usleep( 100 );
       if( gc._done.load() ) return;
    }
  }
}


void* fp_malloc( size_t s )
{
  return thread_local_heap::get().alloc(s);
}

void fp_free( void* v )
{
  thread_local_heap::get().free(v);
}


================================================
FILE: garbage_collector.hpp
================================================


================================================
FILE: hheap.cpp
================================================
#include <atomic>
#include <stdint.h>
#include <memory.h>
#include <stdlib.h>
#include <iostream>
#include <vector>
#include <assert.h>
#include <unistd.h>
#include <mutex>
#include <thread>
std::mutex print_mutex;
#include "disruptor.hpp"

using namespace disruptor;

#if 0
#define PRINT( ... )  \
{ std::unique_lock<std::mutex> _lock(print_mutex); \
    __VA_ARGS__ \
}
#define NEW_PRINT( ... ) \
{ std::unique_lock<std::mutex> _lock(print_mutex); \
  __VA_ARGS__ \
}
#define PAGE_FREE_PRINT( ... ) \
{ std::unique_lock<std::mutex> _lock(print_mutex); \
  __VA_ARGS__ \
}
#else
  #define PRINT(...)
  #define NEW_PRINT(...)
  #define PAGE_FREE_PRINT(...)
#endif


int64_t fast_rand();

struct slot_header
{ 
    int32_t page_id;     // used by free to find the page in the pool
    int16_t pool_id;     // used by free to find the pool
    uint8_t page_slot;   // the slot in the page in the pool
    uint8_t alignment;   // 8 if reserved, 0 if free... byte _data[alignment-1] = alignment.
};


template<uint32_t Size, uint32_t NumSlots>
struct page
{
  public:
    struct slot 
    { 
        int32_t page_id;     // used by free to find the page in the pool
        int16_t pool_id;     // used by free to find the pool
        uint8_t page_slot;   // the slot in the page in the pool
        uint8_t alignment;   // 8 if reserved, 0 if free... byte _data[alignment-1] = alignment.
        char    _data[Size]; // alignment helps us find the page_id/pool_id when allocated aligned objects.
    };

    page(int16_t page_id, int16_t pool_id)
    :_free_write_cursor(NumSlots)
    {
       _pool_id = pool_id;
       _page_id = page_id;
       _posted  = false;

     // ... 
       _free_write_cursor.follows( _free_read_cursor );
       _free_read_cursor.follows( _free_write_cursor );

       for( int i = 0; i < NumSlots; ++i )
       {
          slot& s = _slot[i];
          s.page_id = page_id;
          s.pool_id = pool_id;
          s.page_slot = i;
          s.alignment = 8; // free expects this 
          this->free(i); // increment the free write cursor
       }
       _release_free_pos = 0;

       assert( free_estimate() == NumSlots );
       assert( can_alloc() );
    }

    int32_t free_estimate() 
    { 
       if( _release_free_pos < 0 ) return 0;
       return _free_write_cursor.begin() - _release_free_pos;
    }

    bool can_alloc()
    {
       if( _free_read_cursor.begin() == _free_read_cursor.end() &&
           _free_read_cursor.begin() == _free_read_cursor.check_end() )
       { 
    //      std::cerr<<"    CAN ALLOC? page: "<<_page_id<<" free read cursor begin: "<<_free_read_cursor.begin()<<"   end: "<<_free_read_cursor.end()<<"\n";
          return false; 
       }
       return true;
    }

    char*  alloc(uint8_t align = 8)
    {
       if( !can_alloc() ) return nullptr; 
      
       auto    pos       = _free_read_cursor.begin();
       int64_t free_slot = _free_list.at(pos);
       _free_read_cursor.publish( pos );
        
     //  std::cerr<<"page: "<<_page_id<<" alloc slot: "<<int(free_slot)<<"  alignment: "<<int(align)<<"  free list pos: "<<pos<<"\n";
       assert( free_slot < NumSlots);
       assert( _slot[free_slot].alignment == 0 ); // make the spot as used and take its alignment

       //assert( _slot[free_slot].alignment == 0); // make the spot as used and take its alignment
       _slot[free_slot].alignment = align; // make the spot as used and take its alignment
       return _slot[free_slot]._data;
     //  uint8_t* rtn = (uint8_t*)_slot[free_slot]._data + align - 8; // TODO: adjust for alignment..
     //  rtn[-1] = align;
     //  return (char*)rtn;
    } 
    
    /** return the number of slots freed since this page was 'released' */
    uint64_t    free( uint8_t slot )
    {
    //   std::cerr<<"free slot: "<<int(slot)<<"  alignment: "<<int(_slot[slot].alignment)<<"\n";

       assert( slot < NumSlots );
       assert( _slot[slot].alignment >= 8 );
       assert( _slot[slot].pool_id == _pool_id );
       
       _slot[slot].alignment = 0; // last thing we do is set alignment.

       auto cl = _free_write_cursor.claim(1);
       _free_list.at(cl) = slot;
       //_free_write_cursor.publish_after( cl, cl - 1 );
       _free_write_cursor.publish( cl );//, cl - 1 );

       return free_estimate();
       return 0;
    }

    /** called to save the free cursor position so we can track how many
     *  slots have been freed since this thread gave up control 
     */
    void  release()
    {
        _posted = false;
        _free_claim.store(0,std::memory_order_relaxed);
        _release_free_pos = _free_write_cursor.begin();
    }

    void  claim()
    {
        _release_free_pos = -1;
    }
    bool  claim_free()
    {
       if( !_posted && 0 == _free_claim.fetch_add(1, std::memory_order_release ) )
       {
         return  _posted = true;
       }
       return false;
    }
    bool  is_posted_to_free_list(){ return _posted; }

   private:
    slot                                         _slot[NumSlots]; // actual data storage

    /** the position of the free_write_cursor at the time this page was 'released' 
     *  by the last allocator thread.
     **/
    int64_t                                      _release_free_pos;

    ring_buffer<uint16_t,2*NumSlots>             _free_list;
    shared_write_cursor                          _free_write_cursor;
    read_cursor                                  _free_read_cursor;
    uint32_t                                     _pool_id;
    uint32_t                                     _page_id;
    bool                                         _posted; 
    std::atomic<int>                             _free_claim;
};

/**
 *    A pool is a collection of 'pages' that threads can claim to use
 *    for allocation.  
 *
*/
template<uint16_t PoolId, uint32_t Size,uint32_t SlotsPerPage,uint32_t MaxPages=1024*32>
struct pool
{
   typedef page<Size,SlotsPerPage>  page_type;
   typedef page_type*               page_ptr;
   typedef typename page_type::slot slot_type;
   typedef slot_type*               slot_ptr;

   struct thread_local_data 
   {
      thread_local_data()
      :current_page_num(-1),
       current_page(nullptr){}

      int32_t    current_page_num;
      page_ptr   current_page;
   };

   ring_buffer<uint32_t,MaxPages>   _free_pages; // indexes into _alloc_pages
   shared_write_cursor              _free_page_write_cursor;
   shared_read_cursor               _free_page_read_cursor;

   ring_buffer<page_ptr,MaxPages>   _alloc_pages; // pages allocated (fixed index)
   shared_write_cursor              _page_alloc_cursor;
   const read_cursor                _page_alloc_begin; // used to prevent alloc_cursor from wrapping

   pool()
   :_free_page_write_cursor( MaxPages ),
    _free_page_read_cursor( MaxPages ),
    _page_alloc_cursor( MaxPages )
   {
      _free_page_write_cursor.follows( _free_page_read_cursor );
      _free_page_read_cursor.follows( _free_page_write_cursor );
     // _page_alloc_cursor.follows( _page_alloc_begin );
      //_page_alloc_begin.follows( _page_alloc_cursor ); // begin shouldn't move
   }

   static pool& instance() 
   { 
      static pool _p;
      return _p;
   }

   static thread_local_data*& local_pool()
   {
      static thread_local thread_local_data*  _current = nullptr;
      return _current;
   }

   thread_local_data&  get_local_pool()
   {
      thread_local_data*& cur = local_pool();
      if( cur == nullptr )
      {
         cur = new thread_local_data();
      }
      return *cur;
   }

   char* do_alloc( uint16_t align = 8 )
   {
       thread_local_data& tld = get_local_pool(); //get thread local data

       if( tld.current_page_num == -1 )  // we need to claim a page
       {
          claim_page(tld);
          assert( tld.current_page_num != -1 );
          assert( tld.current_page );
       }
       char* c = tld.current_page->alloc(align);

       while( !c )  // no space available, claim a new page
       { 
         claim_page(tld); 
         c = tld.current_page->alloc(align);
         if( !c ) 
         {
          std::cerr<<"!!?? NULL??\n";
         }
       }
       return c;
   }

   void do_free( char* c )
   {
      uint8_t* s = reinterpret_cast<uint8_t*>(c);
      assert( c != nullptr );
      assert( s[-1] == 8 ); 
      uint8_t* slot_pos = (uint8_t*)c-8;//s + s[-1]-16; // s-1 == alignment, default 8 byte

      slot_ptr sl = reinterpret_cast<slot_ptr>(slot_pos);
      assert( sl->pool_id == PoolId        ); 
      assert( sl->page_slot < SlotsPerPage );
      assert( sl->page_id < MaxPages       );
     
      auto p = _alloc_pages.at(sl->page_id);
      if( p->free(sl->page_slot) > SlotsPerPage/4 )
      {
          if( !p->claim_free() ) return; // do I get to post this.. or does someone else..
          // move page into free queue
          auto claim = _free_page_write_cursor.claim(1);
          _free_pages.at(claim) = sl->page_id;


          PAGE_FREE_PRINT(std::cerr<<"PAGE AVAILABLE: "<<sl->page_id<<"\n";
          std::cerr<<"    sl->pool_id: "<<int(sl->pool_id)<<"  slot: "<<int(sl->page_slot)<<"  id: "<<int(sl->page_id)<<" SlotsPerPage: "<<SlotsPerPage<<"   available_slots: "<<p->free_estimate()<<" \n";
          std::cerr<<"    free_page_write claim: "<<claim<<"\n";
          )

          _free_page_write_cursor.publish_after( claim, claim -1 );
      }
   }

   void claim_page( thread_local_data& tld )
   {
       if( tld.current_page ) tld.current_page->release(); 

       auto read_claim =  _free_page_read_cursor.claim(1);
       if(  !_free_page_read_cursor.is_available( read_claim ) )
       { 
          NEW_PRINT(std::cerr<<"NEW PAGE:    free_read_claim_idx: "<<read_claim<<"\n";)
          auto free_write_idx = _free_page_write_cursor.claim(1); // claim a place to store the 'free' allocated page
          NEW_PRINT(std::cerr<<"             free_write_idx: "<<free_write_idx<<"\n";)

       // the read position is after the next write position... allocate
          // allocate and publish page_idx ... to both free page cursors
          auto alloc_idx  = _page_alloc_cursor.claim(1); // claim a place to allocate.. 
          NEW_PRINT(std::cerr<<"             alloc_write_idx: "<<alloc_idx<<" READ  "<<read_claim<<"\n";)

          _alloc_pages.at(alloc_idx) = new page_type( alloc_idx, PoolId ); // TODO: replace with mmap
          _page_alloc_cursor.publish_after( alloc_idx, alloc_idx-1 ); // publish the allocated buffer
          NEW_PRINT(std::cerr<<"                 alloc published: "<<alloc_idx<<"  READ "<<read_claim<<" \n";)

          _free_pages.at(free_write_idx) = alloc_idx;           
          //_free_page_write_cursor.publish_after(free_write_idx,free_write_idx-1); // publish the new 'free' buffer
          _free_page_write_cursor.publish(free_write_idx);//,free_write_idx-1); // publish the new 'free' buffer
          NEW_PRINT(std::cerr<<"                 free write idx published: "<<free_write_idx<<"  value: "<<_free_pages.at(free_write_idx)<<"\n";)

          NEW_PRINT( std::cerr<<"                READ CLAIM: "<<read_claim<<"\n";);
         // _free_page_read_cursor.wait_for( read_claim );
          auto ridx = _free_pages.at(read_claim);
          NEW_PRINT( std::cerr<<"                 free_page read publish: "<<read_claim<<"  value: "<<ridx<<"\n";)
          //_free_page_read_cursor.publish_after(read_claim,read_claim-1);
          _free_page_read_cursor.publish(read_claim);//,read_claim-1);

          tld.current_page_num = ridx;
          tld.current_page     = _alloc_pages.at(tld.current_page_num);
       }
       else
       {
        NEW_PRINT( std::cerr<<"RECLAIM PAGE:  free_read_claim_idx: "<<read_claim<<"  page: "<<_free_pages.at(read_claim)<<"\n";)
          tld.current_page_num = _free_pages.at(read_claim);
          //_free_page_read_cursor.publish_after(read_claim,read_claim-1);
          _free_page_read_cursor.publish( read_claim );
          tld.current_page     = _alloc_pages.at(tld.current_page_num);
        NEW_PRINT( std::cerr<<"               published free_read_claim_idx: "<<read_claim<<"\n"; )
        NEW_PRINT( std::cerr<<"               available: "<< tld.current_page->free_estimate()<<"\n"; )
       }
       tld.current_page->claim();
   }
   
   static void   free( char* c )             { instance().do_free(c);             };
   static char*  alloc( uint16_t align = 8 ) { return instance().do_alloc(align); };
};


#define BENCH_SIZE ( (1024*256) )
#define ROUNDS 100 
//#define BENCH_SIZE ( (512) )
//#define ROUNDS 5 


#include <thread>
void malloc_bench( int tid )
{
  std::vector<char*> a(BENCH_SIZE);
  memset( a.data(), 0, a.size() * sizeof(char*));
  for( int x = 0; x < ROUNDS; ++x )
  {
    for( int i = 0; i < BENCH_SIZE; ++i )
    {
      int pos = rand() & 1;
      if( a[i] && pos )
      {
          free(a[i]); 
          a[i]=0;
      }
      else if( !a[i] && pos )
      {
          a[i] = (char*)malloc(64);
      }
    }
  }
}
void bench(int tid)
{
  std::vector<char*> a(BENCH_SIZE);
  memset( a.data(), 0, a.size() * sizeof(char*));
  for( int x = 0; x < ROUNDS; ++x )
  {
    for( int i = 0; i < BENCH_SIZE; ++i )
    {
      int pos = rand() & 1;
      if( a[i] && pos )
      {
          pool<1,64,256>::free(a[i]); 
          a[i] = 0;//free(a[i]); 
      }
      else if( !a[i] && pos )
      {
         a[i] = pool<1,64,256>::alloc();
      }
    }
  }
}

std::vector<char*>  buffers[16];


void pc_bench_worker( int pro, int con, char* (*do_alloc)(int s), void (*do_free)(char*)  )
{
  for( int r = 0; r < ROUNDS; ++r )
  {
      for( int x = 0; x < buffers[pro].size()/2 ; ++x )
      {
         int p = fast_rand() % buffers[pro].size();
         if( !buffers[pro][p] )
         {
           auto si = 60; //fast_rand() % (1<<15);
           auto r = do_alloc( si );

           slot_header* sh = (slot_header*)(r-8);// TODO: handle alignment
           //assert( sh->alignment == 8 );
           //assert( sh->pool_id > 3 );

           if( r == nullptr )
           {
            std::cerr<<"size: "<<si<<"  returned null\n";
           }
           assert( r != nullptr );
           assert( r[0] != 99 ); 
           r[0] = 99; 
           buffers[pro][p] = r;
         }
      }
      for( int x = 0; x < buffers[con].size()/2 ; ++x )
      {
         int p = fast_rand() % buffers[con].size();
         if( buffers[con][p] ) 
         { 
           //assert( buffers[con][p][0] == 99 ); 
           buffers[con][p][0] = 0; 
           do_free(buffers[con][p]);
           buffers[con][p] = 0;
         }
      }
  }
}
#if 0
void pc_bench_worker( int pro, int con, char* (*do_alloc)(), void (*do_free)(char*)  )
{
  for( int r = 0; r < ROUNDS; ++r )
  {
     // produce some
     for( int i = 0; i < buffers[pro].size(); ++i )
     {
        // don't wrap...
      //  while( buffers[pro][i] ) usleep(0);
        buffers[pro][i] = do_alloc();
     }
     for( int i = 0; i < BENCH_SIZE*2; ++i )
     {
        rand() % buffers[con].size()
     }

     usleep( 100 );
     for( int i = 0; i < buffers[pro].size(); ++i )
     {
     //   while( !buffers[con][i] ) usleep(0);
        if( buffers[con][i] )
        {
           do_free(buffers[con][i]);
           buffers[con][i] = 0;
        }
     }
  }
}
#endif

void pc_bench(char* (*do_alloc)(int s), void (*do_free)(char*)  )
{
  for( int i = 0; i < 16; ++i )
  {
    buffers[i].resize( BENCH_SIZE );
    memset( buffers[i].data(), 0, 8 * BENCH_SIZE );
  }
  std::thread a( [=](){ pc_bench_worker( 1, 2, do_alloc, do_free ); } );
  std::thread b( [=](){ pc_bench_worker( 2, 3, do_alloc, do_free ); } );
  std::thread c( [=](){ pc_bench_worker( 3, 4, do_alloc, do_free ); } );
  std::thread d( [=](){ pc_bench_worker( 4, 5, do_alloc, do_free ); } );
  std::thread e( [=](){ pc_bench_worker( 5, 6, do_alloc, do_free ); } );
  std::thread f( [=](){ pc_bench_worker( 6, 7, do_alloc, do_free ); } );
  std::thread g( [=](){ pc_bench_worker( 7, 8, do_alloc, do_free ); } );
  std::thread h( [=](){ pc_bench_worker( 8, 9, do_alloc, do_free ); } );
  std::thread i( [=](){ pc_bench_worker( 9, 10, do_alloc, do_free ); } );
  std::thread j( [=](){ pc_bench_worker( 10, 1, do_alloc, do_free ); } );

  a.join();
  b.join();
  c.join();
  d.join();
  e.join();
  f.join();
  g.join();
  h.join();
  i.join();
  j.join();
}
void pc_bench_st(char* (*do_alloc)(int s), void (*do_free)(char*)  )
{
  for( int i = 0; i < 16; ++i )
  {
    buffers[i].resize( BENCH_SIZE );
    memset( buffers[i].data(), 0, 8 * BENCH_SIZE );
  }
  int i = 0;
  std::thread a( [=](){ pc_bench_worker( 1, 1, do_alloc, do_free ); } );
  /*
  std::thread b( [=](){ pc_bench_worker( 2, 2, do_alloc, do_free ); } );
  std::thread c( [=](){ pc_bench_worker( 3, 3, do_alloc, do_free ); } );
  std::thread d( [=](){ pc_bench_worker( 4, 4, do_alloc, do_free ); } );
  std::thread e( [=](){ pc_bench_worker( 5, 5, do_alloc, do_free ); } );
  std::thread f( [=](){ pc_bench_worker( 6, 6, do_alloc, do_free ); } );
  std::thread g( [=](){ pc_bench_worker( 7, 7, do_alloc, do_free ); } );
  std::thread h( [=](){ pc_bench_worker( 8, 8, do_alloc, do_free ); } );
  */

  a.join();
  /*
  b.join();
  c.join();
  d.join();
  e.join();
  f.join();
  g.join();
  h.join();
  */
}

char* do_malloc(int s){ return (char*)malloc(s); }
void  do_malloc_free(char* c){ free(c); }
char* do_hash_malloc(int s)
{ 
    #define LOG2(X) ((unsigned) (8*sizeof (unsigned long long) - __builtin_clzll((X)) - 1))
    switch( LOG2(s)+1 )
    {
       case 64:
            assert("!dont malloc yet.." );
            return (char*)malloc(s);
       case 16:
            return pool<16,1<<16,8>::alloc(); 
       case 15:
            return pool<15,1<<15,16>::alloc(); 
       case 14:
            return pool<14,1<<14,32>::alloc(); 
       case 13:
            return pool<13,1<<13,64>::alloc(); 
       case 12:
            return pool<12,1<<12,64>::alloc(); 
       case 11:
            return pool<11,1<<11,64>::alloc(); 
       case 10:
            return pool<10,1<<10,128>::alloc(); 
       case 9:
            return pool<9,1<<9,128>::alloc(); 
       case 8:
            return pool<8,1<<8,128>::alloc(); 
       case 7:
            return pool<7,1<<7,256>::alloc(); 
       case 6:
            return pool<6,1<<6,256>::alloc(); 
       case 5:
       default:
            return pool<5,1<<5,256>::alloc(); 
    }
    assert( !"we shoudln't get here!" );
}


void  do_hash_free(char* c)
{ 
    assert( c != nullptr );
    uint8_t a = *(c-1); // alignment
    slot_header* sh = (slot_header*)(c-8);// TODO: handle alignment
    assert( a == 8 );
    if( !(sh->pool_id >=5 && sh->pool_id <= 16 ) )
    {
      PRINT( std::cerr<< "ERROR: pool_id: "<<sh->pool_id<<"\n"; 
          std::cerr.flush();
          assert( sh->pool_id >=5 && sh->pool_id <= 16 );
      );
    }
    switch( sh->pool_id )
    {
       case 16:
            pool<16,1<<16,8>::free(c); 
            return;
       case 15:
            pool<15,1<<15,16>::free(c); 
            return;
       case 14:
            pool<14,1<<14,32>::free(c); 
            return;
       case 13:
            pool<13,1<<13,64>::free(c); 
            return;
       case 12:
            pool<12,1<<12,64>::free(c); 
            return;
       case 11:
            pool<11,1<<11,64>::free(c); 
            return;
       case 10:
            pool<10,1<<10,128>::free(c); 
            return;
       case 9:
            pool<9,1<<9,128>::free(c); 
            return;
       case 8:
            pool<8,1<<8,128>::free(c); 
            return;
       case 7:
            pool<7,1<<7,256>::free(c); 
            return;
       case 6:
            pool<6,1<<6,256>::free(c); 
            return;
       case 5:
       default:
            pool<5,1<<5,256>::free(c); 
            return;
    }
    assert( !"we shoudln't get here!" );
}


int main( int argc, char** argv )
{
  if( argc > 1 && argv[1][0] == 'm' )
  {
    std::cerr<<"malloc multi\n";
    pc_bench( do_malloc, do_malloc_free );
  }
  if( argc > 1 && argv[1][0] == 'M' )
  {
    std::cerr<<"hash malloc multi\n";
    pc_bench( do_hash_malloc, do_hash_free );
  }
  if( argc > 1 && argv[1][0] == 's' )
  {
    std::cerr<<"malloc single\n";
    pc_bench_st( do_malloc, do_malloc_free );
  }
  if( argc > 1 && argv[1][0] == 'S' )
  {
    std::cerr<<"hash malloc single\n";
    pc_bench_st( do_hash_malloc, do_hash_free );
  }
  return 0;
}


================================================
FILE: ideas.txt
================================================
Global Ready Queue per Size Class of 256 each.... combined with 16 per thread per size assuming 16 threads... class means that
in the 'idle state' we have

Size allocations are not 'random', but usually fall into predictable patterns.
The 'ideal' buffer size is one that is never full and never empty... if it ever empties then the next time
you fill it you should fill it 'fuller' than the last time... and attempt to keep it there.  

If the buffer is 'full' when you check then you can start reclaiming data from that buffer.


GC Thread:

For each size class... maintain a hash 'set' of free chunks in that set.

When a new chunk comes in, look for its 'prev' in its hash set, if found remove it, merge the two... then look for the 'next' if found merge the two...
then store the result back in the new hash table after checking to see if the queue for that size class is waiting for data.


GC Thread Loop:
{
  foreach thread_garabage_bin
     pull all chunks, insert them into merge set, then merge them if possible

  foreach size class
     refill the queue
        if queue was empty... grow the queue by 4
        if queue was full... increment full count
            if full count > N then reclaim 25% and reset full count.
        pull chunks from proper size heap... 
          - if not enough are available then divide up chunks from the 
             next size up.
        if a chunk reaches the 'page size' and the 'page size' block queue is
        empty then we can release it back to the OS.

   when there is no merging / reclaiming to do... set a flag and
   wait on a mutex... next person to call free will wake me up when
   they see the flag set.

   When choosing empty chunks to place in the queue... pick the chunk from the
   block with the 'oldest' creation time.  This optimization requires more
   expensive 'sorting', we can skip this step when ever there is demand for
   'all chunks' of a particular class size, but when there is only demand for
   a fraction of the available chunks, then, because we are scanning the
   hash table linerally... 

   Each node in the hash table points to prev/next pairs... when a hash is
   'inserted' its memory location is based on its hash value, but its prev/next
   is based upon order of arival.   Thus you can quickly find a node, then
   extract like a double-linked-list.  
}

Merge Cost:
  2 hash lookups + 1 hash set and perhaps 2 hash clears
  3 total calls to city hash...

The 'free queue' can be a linked list of the 'freed chunks'.
  Each thread has its 'ready bin' which it will set 'if null', and
  its pending bin which it will fill if the ready bin is not null.
  the memory space in the block is converted into a 'next' pointer.
  no large per-thread 'free queues'.

  
   queues will adjust in length until they can handle the 'burst' processing rate
   of the GC thread.

  When the GC thread cannot keep the queues full, then threads fall back on
  directly allocating their own chunks.
  

Overhead per block.. 8 byte header + 4 byte in free table or 8 byte in queue.
Queue sizes adjust 

Header:
prev + next offsets.
start of mmap chunk sets prev to 0
end of mmap chunk is a header with next = 0.


================================================
FILE: malloc2.cpp
================================================
/**
 *   Each thread has its own 'arena' where it can allocate 'new' blocks of what ever size it needs (buckets). After
 *   a thread is done with memory it places it in a garbage collection queue.
 *
 *   The garbage collector follows each threads trash bin and moves the blocks into a recycled list that
 *   all other threads can pull from.
 *
 *   The garbage collector can grow these queues as necessary and shrink them as time progresses.
 */

#include <vector>
//#include "mmap_alloc.hpp"
#include "disruptor.hpp"
#include <thread>
#include "fast_rand.cpp"

using namespace disruptor;

#define PAGE_SIZE (4*1024*1024)
#define BENCH_SIZE ( (2024) )
#define ROUNDS 200000 
#define LOG2(X) ((unsigned) (8*sizeof (unsigned long long) - __builtin_clzll((X)) - 1))

struct block_header
{
   uint32_t   _page_pos; // how far from start of page
   uint32_t   _prev;
   uint32_t   _next;
   uint32_t   _timestamp;// creation time... we want to use 'old blocks' first
                         // because they are most likley to contain long-lived objects
   size_t calc_size(){ return _next - _page_pos;   }
   int calc_bin_num(){ return LOG2(calc_size())+1; }
};
block_header* allocate_block_page();

/**
 *  2MB chunk of memory that gets divided up
 *  'on request', rounded to the nearest multiple
 *  of 128 bytes so that it can be binned/cached
 *  effectively.
 */
struct page
{
  block_header   data[PAGE_SIZE/sizeof(block_header)]; 
};

class thread_allocator
{
  public:
    void    free( char* c )
    {
      block_header* b = reinterpret_cast<block_header*>(c) - 1;
      int bin = b->calc_bin_num();
      if( _cache_pos[bin] > _cache_end[bin] - 32 )
      {
         _cache[bin].at(_cache_end[bin]++) = c;
         return;
      }
      
      auto pos = _gc_read_end_buffer;
      _garbage_bin.at(pos) = c;
      _gc_read_end_buffer = pos + 1;
      /*
      _gc_read_end_buffer = pos + 1;
      */
      if( _gc_read_end_buffer - _gc_read_end_last_write > 10 )
      {
        _gc_read_end = _gc_read_end_last_write = _gc_read_end_buffer;
      }
    }

    char*   alloc( size_t s );

    static thread_allocator& get()
    {
        static __thread thread_allocator* tld = nullptr;
        if( !tld )  // new is not an option
        { 
            tld = reinterpret_cast<thread_allocator*>( malloc(sizeof(thread_allocator))/*mmap_alloc( sizeof(thread_allocator)*/ );
            tld = new (tld) thread_allocator(); // inplace construction

            // TODO: allocate  pthread_threadlocal var, attach a destructor /clean up callback
            //       to that variable... 
        }
        return *tld;
    }

  protected:
    char*  split_chunk( char* c, size_t l );

    thread_allocator();
    ~thread_allocator();

    friend class garbage_collector;
    
    int64_t           _gc_begin;               // how far has gc processed
    int64_t           _pad[7];                 // save the cache lines/prevent false sharing
    int64_t           _gc_read_end;            // how far can gc read
    int64_t           _pad2[7];                // save the cache lines/prevent false sharing
    int64_t           _gc_read_end_buffer;     // cache writes to gc_read_end to every 10 writes
    int64_t           _gc_read_end_last_write; // cache writes to gc_read_end to every 10 writes
    int64_t           _cache_pos[32];
    int64_t           _cache_end[32];

    char*   get_garbage( int64_t pos ) // grab a pointer previously claimed.
    {
      // we may have to dynamically reallocate our gbin
      return _garbage_bin.at(pos);
    }
    block_header*               _next_block;
    ring_buffer<char*,1024*8>   _garbage_bin;
    ring_buffer<char*,4>        _cache[32];
};


typedef thread_allocator* thread_alloc_ptr;


/**
 *   Polls all threads for freed items.
 *   Upon receiving a freed item, it will look
 *   at its size and move it to the proper recycle
 *   bin for other threads to consume.
 *
 *   When there is less work to do, the garbage collector
 *   will attempt to combine blocks into larger blocks
 *   and move them to larger cache sizes until it
 *   ultimately 'completes a page' and returns it to
 *   the system.  
 *
 *   From the perspective of the 'system' an alloc
 *   involves a single atomic fetch_add.
 *
 *   A free involves a non-atomic store.
 *
 *   No other sync is necessary.
 */
class garbage_collector
{
  public:
    garbage_collector();
    ~garbage_collector();
    /**
     *  Handles objects of the same size.
     */
    class recycle_bin
    {
       public:
          recycle_bin(int num = 0)
          :_next_write(0),_write_pos(0),_read_pos(0),_bin_num(num)
          {
          }
          void sync_write_pos()
          {
     //       ((std::atomic<int64_t>*)&_write_pos)->load();
          }

          int64_t                       _next_write;
          int64_t                       _pad0[7];
          int64_t                       _write_pos;
          int64_t                       _pad[7];
          std::atomic<int64_t>          _read_pos;
          int64_t                       _pad2[7];
          ring_buffer<char*,1024*256>   _free_bin;
          int                           _bin_num;
    };

    std::atomic<int64_t>  _sync;

    int get_bin_num( size_t s )
    {
      return LOG2(s)+1;
    }

    recycle_bin&  get_bin( size_t bin_num ) 
    { 
        assert( bin_num < 32 );
        return _bins[bin_num];
    }

    void register_allocator( thread_alloc_ptr ta );
    void unregister_allocator( thread_alloc_ptr ta );

    static garbage_collector& get()
    {
        static garbage_collector gc;
        return gc;
    }
  private:
    static void  run();
    void  recycle( char* c );

    std::thread                _thread;
    recycle_bin                _bins[32];
    std::atomic<uint32_t>      _next_talloc;
    thread_alloc_ptr           _tallocs[128];
    static std::atomic<bool>   _done;
};
std::atomic<bool> garbage_collector::_done(false);

garbage_collector::garbage_collector()
:_thread( &garbage_collector::run )
{
  memset( _tallocs, 0, sizeof(_tallocs) );
}
garbage_collector::~garbage_collector()
{
  _done.store(true, std::memory_order_release );
  _thread.join();
}

void garbage_collector::register_allocator( thread_alloc_ptr ta )
{
  printf( "registering thread allocator %p\n", ta );
  // TODO: just lock here... 
  auto pos = _next_talloc.fetch_add(1);
  _tallocs[pos] = ta;
}
void garbage_collector::unregister_allocator( thread_alloc_ptr ta )
{
  for( int i = 0; i < 128; ++i )
  {
    if( _tallocs[i] == ta ) 
    {
      _tallocs[i] = nullptr;
    }
  }
}

void  garbage_collector::run()
{
    garbage_collector& self = garbage_collector::get();
    while( true )
    {
        bool found_work = false;
        for( int i = 0; i < 128; i++ )
        {
             // TODO: not safe assumption, threads can come/go at will
             // leaving holes... thread cleanup code needs locks around it
             // to prevent holes..
            if( self._tallocs[i] != nullptr ) 
            {
                auto b = self._tallocs[i]->_gc_begin;
                auto e = self._tallocs[i]->_gc_read_end;

                if( b != e ) found_work = true;
                for( auto p = b; p < e; ++p )
                {
                    char* c = self._tallocs[i]->get_garbage(p);


                    self.recycle( c);
                }
                self._tallocs[i]->_gc_begin = e; 
            }
        }
        if( !found_work ) 
        {
        //  usleep(0);
            if( _done.load( std::memory_order_acquire ) ) return;
        }
    }
}

void garbage_collector::recycle( char* c )
{
   block_header* h = ((block_header*)c)-1;
   assert( h->_next - h->_page_pos > 0 );
   recycle_bin& b = get_bin( get_bin_num(h->_next - h->_page_pos)  );
   auto p = b._next_write++;
   while( b._free_bin.at(p) != nullptr )
   {
//      fprintf( stderr, "opps.. someone left something behind...\n" );
      p = b._next_write++;
   }
   b._free_bin.at(p) = c;
   b._write_pos = p;
//   if( b._write_pos % 256 == 128 ) 
 //     b.sync_write_pos();
}

block_header* allocate_block_page()
{
    fprintf( stderr, "#" );
    auto limit = malloc(PAGE_SIZE);//mmap_alloc( PAGE_SIZE );

    block_header* _next_block = reinterpret_cast<block_header*>(limit);
    _next_block->_page_pos = 0;
    _next_block->_prev = 0;
    _next_block->_next = PAGE_SIZE; // next block always goes to end...; 
    _next_block->_timestamp = 0; // TODO... 
    return _next_block;
}


thread_allocator::thread_allocator()
{
  _gc_begin = 0;
  _gc_read_end = 0;
  _gc_read_end_buffer = 0;
  _gc_read_end_last_write = 0;
  _next_block = allocate_block_page();
  memset( _cache_pos, 0, sizeof(_cache_pos) );
  memset( _cache_end, 0, sizeof(_cache_end) );

  garbage_collector::get().register_allocator(this);
}

thread_allocator::~thread_allocator()
{
  // give the rest of our allocated chunks to the gc thread
  free( reinterpret_cast<char*>(_next_block+1) ); 
  garbage_collector::get().unregister_allocator(this);

  // GARBAGE COLLECTOR must do the mmap free because we don't know
  // when it will notice this thread going away... 
  // TODO: post a message to GC to track thread cleanup.
  
  // mmap_free( this, sizeof(*this) );
}

/**
 *  returns len bytes starting at s, potentially freeing 
 *  anything after s+len.
 */
char* thread_allocator::split_chunk( char* s, size_t len )
{
  return s; 
}

char* thread_allocator::alloc( size_t s )
{
    assert( s > 0 );
    s = 64*((s + 63)/64); // multiples of 64 bytes

    if( s+sizeof(block_header) >= PAGE_SIZE  )
    {
       assert( false );
       // do direct mmap 
      return nullptr;
    }
    int bin_num = garbage_collector::get().get_bin_num( s );

    int limit = std::min<int>(bin_num + 4,32);
    for( int i = bin_num; i < limit; ++i )
    {
      if( _cache_pos[i] < _cache_end[i] )
      {
         char* c = _cache[i].at(_cache_pos[i]);
         ++_cache_pos[i];

         return split_chunk( c, s );
      }
    }
    static int64_t hit = 0;
    static int64_t miss = 0;
    static int64_t sync_count = 0;
    ++sync_count;
 //   if( sync_count % 64  == 63 ) 
 //       rb->sync_write_pos();


    int end_bin = bin_num+1;// + 4;
    for( ; bin_num < end_bin; ++ bin_num )
    {
       garbage_collector::recycle_bin* rb = &garbage_collector::get().get_bin( bin_num );
       while( rb )
       {
          // TODO: ATOMIC ... switch to non-atomic check
          auto write_pos = rb->_write_pos;
         // printf( "recyclebin wirte_pos: %d  read_cur.begin %d\n", write_pos, rb->_read_cur.pos().aquire()  );
       
          auto avail = write_pos - *((int64_t*)&rb->_read_pos);
          if(  avail > 16 )// /*.load( std::memory_order_relaxed )*/ < write_pos )
          {
             // ATOMIC CLAIM FROM SHARED POOL... MOST EXPENSIVE OP WE HAVE...
             //auto pos = rb->_read_cur.pos().atomic_increment_and_get(1)-1;
             //auto pos = rb->_read_pos.fetch_add(4,std::memory_order_relaxed);
             auto pos = rb->_read_pos.fetch_add(8);//,std::memory_order_acquire);
             auto e = pos + 8;
             while( pos < e )
             {
                char* b = rb->_free_bin.at(pos);
                if( b )
                {
                   _cache[bin_num].at(_cache_end[bin_num]++) = b;
                   rb->_free_bin.at(pos) = nullptr;
                } 
                else
                {
         //         fprintf( stderr, "read too much..\n" );
                }
                ++pos;
             }
       
             if( _cache_pos[bin_num] < _cache_end[bin_num] )
             {
                char* c = _cache[bin_num].at(_cache_pos[bin_num]);
                ++_cache_pos[bin_num];
                ++hit;
                return c;
             }
          } // else there are no blocks our size... go up a size or two?..
          break;
       }
       ++miss;
   //    if( miss % 10000 == 0 ) fprintf( stderr, "\nHit: %lld    Miss: %lld          \r", hit, miss );
    }
    // we already checked the 'best fit' bin and failed to find 
    // anything that size ready, so we can allocate it from our 
    // thread local block

 //   printf( "allocating new chunk from thread local page\n" );

    // make sure the thread local block has enough space...
    if( _next_block->_page_pos + s + sizeof(block_header) >= PAGE_SIZE )
    {
        // not enough space left in current block.. free it... if it has any space at all.
        if( _next_block->_page_pos != PAGE_SIZE )
        {
            free( (char*)(_next_block+1) );
        }

        _next_block = allocate_block_page();
        assert( _next_block != nullptr );
    }
   // fprintf( stderr, "alloc %d   at block pos %d\n", s+1, _next_block->_page_pos );

    block_header* new_b   = _next_block;
    _next_block = new_b + 1 + s/sizeof(block_header);

    _next_block->_page_pos  = new_b->_page_pos + sizeof(block_header) + s;
    _next_block->_prev      = new_b->_page_pos; 
    _next_block->_next      = PAGE_SIZE; // next block always goes to end...
    _next_block->_timestamp = new_b->_timestamp; // TODO...

    new_b->_next            = _next_block->_page_pos;
    
    // our work here is done give them the newly allocated block (pointing after the header
    return reinterpret_cast<char*>(new_b+1);
}

char* malloc2( int s )
{
  return thread_allocator::get().alloc(s);
}

void  free2( char* s )
{
  return thread_allocator::get().free(s);
}


/*  SEQUENTIAL BENCH
int main( int argc, char** argv )
{
  if( argc == 2 && argv[1][0] == 'S' )
  {
     printf( "malloc2\n");
     for( int i = 0; i < 50000000; ++i )
     {
        char* test = malloc2( 128 );
        assert( test != nullptr );
        test[0] = 1;
        free2( test );
     }
  }
  if( argc == 2 && argv[1][0] == 's' )
  {
     printf( "malloc\n");
     for( int i = 0; i < 50000000; ++i )
     {
        char* test = (char*)malloc( 128 );
        assert( test != nullptr );
        test[0] = 1;
        free( test );
     }
  }
  fprintf( stderr, "done\n");
 // sleep(5);
  return 0;
}
*/

/* RANDOM BENCH */
std::vector<char*>  buffers[16];
void pc_bench_worker( int pro, int con, char* (*do_alloc)(int s), void (*do_free)(char*)  )
{
  for( int r = 0; r < ROUNDS; ++r )
  {
      for( int x = 0; x < buffers[pro].size()/2 ; ++x )
      {
         uint32_t p = fast_rand() % buffers[pro].size();
         if( !buffers[pro][p] )
         {
           uint64_t si = 32 + fast_rand()%(8096*16); //4000;//32 + fast_rand() % (1<<16);
           auto r = do_alloc( si );
           assert( r != nullptr );
         //  assert( r[0] != 99 ); 
         //  r[0] = 99; 
           buffers[pro][p] = r;
         }
      }
      for( int x = 0; x < buffers[con].size()/2 ; ++x )
      {
         uint32_t p = fast_rand() % buffers[con].size();
         assert( p < buffers[con].size() );
         assert( con < 16 );
         assert( con >= 0 );
         if( buffers[con][p] ) 
         { 
           //assert( buffers[con][p][0] == 99 ); 
          // buffers[con][p][0] = 0; 
           do_free(buffers[con][p]);
           buffers[con][p] = 0;
         }
      }
  }
}


void pc_bench(int n, char* (*do_alloc)(int s), void (*do_free)(char*)  )
{
  for( int i = 0; i < 16; ++i )
  {
    buffers[i].resize( BENCH_SIZE );
    memset( buffers[i].data(), 0, 8 * BENCH_SIZE );
  }

  std::thread* a = nullptr;
  std::thread* b = nullptr;
  std::thread* c = nullptr;
  std::thread* d = nullptr;
  std::thread* e = nullptr;
  std::thread* f = nullptr;
  std::thread* g = nullptr;
  std::thread* h = nullptr;
  std::thread* i = nullptr;
  std::thread* j = nullptr;


 int s = 1;
  switch( n )
  {
     case 10:
     a = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } );
     n--;
     s++;
     case 9:
      b = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } );
     n--;
     s++;
     case 8:
      c = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } );
     n--;
     s++;
     case 7:
      d = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } );
     n--;
     s++;
     case 6:
     e = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } );
     n--;
     s++;
     case 5:
     f = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } );
     n--;
     s++;
     case 4:
      g = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } );
     n--;
     s++;
     case 3:
      h = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } );
     n--;
     s++;
     case 2:
      i = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } );
     n--;
     s++;
     case 1:
      j = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } );
  }
  if(a)
  a->join();
  if(b)
  b->join();
  if(c)
  c->join();
  if(d)
  d->join();
  if(e)
  e->join();
  if(f)
  f->join();
  if(g)
  g->join();
  if(h)
  h->join();
  if(i)
  i->join();
  if(j)
  j->join();

}
void pc_bench_st(char* (*do_alloc)(int s), void (*do_free)(char*)  )
{
  for( int i = 0; i < 16; ++i )
  {
    buffers[i].resize( BENCH_SIZE );
    memset( buffers[i].data(), 0, 8 * BENCH_SIZE );
  }
  int i = 0;
  std::thread a( [=](){ pc_bench_worker( 1, 1, do_alloc, do_free ); } );
  a.join();
}
#include <tbb/scalable_allocator.h>

char* do_malloc(int s)
{ 
//    return (char*)::malloc(s); 
   return (char*)scalable_malloc(s);
}
void  do_malloc_free(char* c)
{ 
    scalable_free(c);
  // ::free(c); 
}

int main( int argc, char** argv )
{
  if( argc > 2 && argv[1][0] == 'm' )
  {
    std::cerr<<"malloc multi\n";
    pc_bench( atoi(argv[2]), do_malloc, do_malloc_free );
  }
  if( argc > 2 && argv[1][0] == 'M' )
  {
    std::cerr<<"hash malloc multi\n";
    pc_bench( atoi(argv[2]), malloc2, free2 );
  }
  if( argc > 1 && argv[1][0] == 's' )
  {
    std::cerr<<"malloc single\n";
    pc_bench_st( do_malloc, do_malloc_free );
  }
  if( argc > 1 && argv[1][0] == 'S' )
  {
    std::cerr<<"hash malloc single\n";
    pc_bench_st( malloc2, free2 );
  }
  return 0;
}


================================================
FILE: malloc2.hpp
================================================


================================================
FILE: malloc3.cpp
================================================
/**
 *   Each thread has its own 'arena' where it can allocate 'new' blocks of what ever size it needs (buckets). After
 *   a thread is done with memory it places it in a garbage collection queue.
 *
 *   The garbage collector follows each threads trash bin and moves the blocks into a recycled list that
 *   all other threads can pull from.
 *
 *   The garbage collector can grow these queues as necessary and shrink them as time progresses.
 */

#include <vector>
#include <unordered_set>

#include "mmap_alloc.hpp"
#include "disruptor.hpp"
#include <thread>
#include <stdint.h>
#include <memory.h>
#include <stdlib.h>
#include <iostream>
#include <vector>
#include <assert.h>
#include <unistd.h>
#include <iostream>
#include <sstream>
//#include "rand.cpp"


using namespace disruptor;

#define PAGE_SIZE (4*1024*1024)
#define BENCH_SIZE ( (1024) )
#define ROUNDS 200000
#define LOG2(X) ((unsigned) (8*sizeof (unsigned long long) - __builtin_clzll((X)) - 1))
#define NUM_BINS 32 // log2(PAGE_SIZE)

class block_header
{
   public:
      block_header* next()
      { 
         assert(this);
         if( _size > 0 ) return reinterpret_cast<block_header*>(data()+_size); 
         else return nullptr;
      }
      block_header* prev()
      { 
         assert(this);
         if( _prev_size <= 0 ) return nullptr;
         return reinterpret_cast<block_header*>(reinterpret_cast<char*>(this) - _prev_size - 8);
      }

      enum flags_enum
      {
         unknown  = 0,
         idle     = 1, // in storage, mergable
         queued   = 2, // in waiting queue...
         cached   = 4, // cached in thread
         active   = 8, // in use by app
         mergable = 16 // track this or will false sharing kill me?
      };

      struct queue_state // the block is serving as a linked-list node
      {
          block_header* next;
          block_header* prev;
      };

      void set_state( flags_enum e )
      {
         _flags = e;
      }
      flags_enum get_state() { return (flags_enum)_flags; }

      queue_state& as_queue_node()
      {
         return *reinterpret_cast<queue_state*>(data());
      }

      queue_state& init_as_queue_node()
      {
         // _flags |= queued;
         queue_state& s = as_queue_node();
         s.next = nullptr;
         s.prev = nullptr;
         return s;
      }


      void init( int s )
      {
         _prev_size = 0;
         _size = - (s-8);
      }

      char*         data()      { return ((char*)this)+8; }
      int           size()const { return abs(_size); }

      int raw_size()const { return _size; }
      int raw_prev_size()const { return _prev_size; }


      int        calc_forward_extent()
      {
         // fprintf( stderr, "pos %p + %d  -> ", this, _size );
          int s = size() + 8;
          auto n = next();
          if( n ) s += n->calc_forward_extent();
          return s;
      }

      int       page_size()
      {
          auto h = head();
          assert(h);
          return head()->calc_forward_extent(); 
      }
      block_header*       head()
      {
          auto pre = prev();
          if( !pre ) return this;
          do {
            auto next_prev = pre->prev();
            if( !next_prev ) return pre;
            pre = next_prev;
          } while ( true );
      }

      /** create a new block at p and return it */
      block_header* split_after( int s )
      {
         assert( s >= 32 );
//         fprintf( stderr, "prev_size %d  _size %d  Initial Error: %d\n", _prev_size, _size, int(PAGE_SIZE - this->page_size()) );
         assert( PAGE_SIZE == page_size() );
         
         if(  (size() - 8 -32) < s ) return nullptr;// no point in splitting to less than 32 bytes

         block_header* n = reinterpret_cast<block_header*>(data()+s);
         n->_prev_size   = s;
         n->_size        = size() -s -8;
         
         if( _size < 0 ) 
            n->_size = -n->_size; // we just split the tail

         _size = s; // this node now has size s
         assert( size() >= s );
         assert( PAGE_SIZE == n->page_size() );
         assert( PAGE_SIZE == page_size() );
         return n;
      }

      // merge this block with next, return head of new block.
      block_header* merge_next()
      {
         assert( PAGE_SIZE == page_size() );
         assert( _flags == block_header::idle );
         auto nxt = next();
         if( !nxt ) return this;
         assert( nxt->page_size() == PAGE_SIZE );

         // next must be in the idle state
         if( nxt->_flags != idle ) return this;

         // extract node from the double link list it is in.
         queue_state& qs = nxt->as_queue_node();
         if( qs.next )
         {
      //      assert( qs.next->as_queue_node().prev == nxt );
            qs.next->as_queue_node().prev = qs.prev;
         }

         if( qs.prev )
         {
       //     assert( qs.prev->as_queue_node().next == nxt );
            qs.prev->as_queue_node().next = qs.next;
         }

         // now we are free to merge the memory
         _size += nxt->size() + 8;
         fprintf( stderr, "merged to size %d\n", _size );
         if( nxt->_size < 0 ) _size = -_size;

         nxt = next(); // find the new next.
         if( nxt )
         {
           nxt->_prev_size = size();
         }
         assert( PAGE_SIZE == page_size() );
         if( next() ) assert( PAGE_SIZE == next()->page_size() );
         if( prev() ) assert( PAGE_SIZE == prev()->page_size() );
         return this;
      }

      // merge this block with the prev, return the head of new block
      block_header* merge_prev() 
      {
         _flags = idle; // mark myself as idle/mergable
         auto p = prev();
         if( !p ) return this;
         if( p->_flags != idle ) return this;
         return p->merge_next();
      }

   private:
      int32_t   _prev_size; // size of previous header.
      int32_t   _size:24; // offset to next, negitive indicates tail, 8 MB max, it could be neg
      int32_t   _flags:8; // offset to next, negitive indicates tail
};
static_assert( sizeof(block_header) == 8, "Compiler is not packing data" );

/** returns a new block page allocated via mmap 
 *  The page has 2 block headers (head+tail) defined
 *  and head is returned.
 **/
block_header* allocate_block_page();

struct block_list_node
{
    block_list_node():next(nullptr){};
    block_list_node* next;

    block_header*    header()
    {
      return  reinterpret_cast<block_header*>(reinterpret_cast<char*>(this)-8);
    }

    int count()
    {
      int count = 1;
      auto n = next;
      while( n )
      {
        ++count;
        assert( count < 1000 );
        n = n->next;
      }
      return count;
    }

    block_list_node* find_end() 
    {
       block_list_node* n = this;
       while( n->next )
       {
          n = n->next;
       }
       return n;
    }
};


class thread_allocator
{
  public:
    char*   alloc( size_t s );

    void    free( char* c )
    {
        auto node = reinterpret_cast<block_header*>(c-8); // store a point
        node->init_as_queue_node().next = _gc_on_deck;
        if( !_gc_at_bat )
        {
           _gc_at_bat = node;
           _gc_on_deck = nullptr;
        }
        else
        {
           _gc_on_deck = node;
        }
    }

    static thread_allocator& get()
    {
        static __thread thread_allocator* tld = nullptr;
        if( !tld )  // new is not an option
        { 
            tld = reinterpret_cast<thread_allocator*>( mmap_alloc( sizeof(thread_allocator) ) );
            tld = new (tld) thread_allocator(); // inplace construction

            // TODO: allocate  pthread_threadlocal var, attach a destructor /clean up callback
            //       to that variable... 
        }
        return *tld;
    }

    void print_cache()
    {
       for( int i = 0; i < NUM_BINS; ++i )
       {
          fprintf( stderr, "%d]  size %d   \n", i, _bin_cache_size[i] );
       }
    }

  protected:

    bool          store_cache( block_header* h )
    {
      assert( h->page_size() == PAGE_SIZE );
       auto bin = LOG2( h->size() );
      if( _bin_cache[bin] == nullptr )
      {
        _bin_cache[bin] = h;
        return true;
      }
      return false;
      /*
       assert( h != nullptr );

       if( _bin_cache_size[bin] < 4 )
       {
          if( _bin_cache_size[bin] == 0 ) assert( nullptr == _bin_cache[bin] );

          block_list_node* bln = reinterpret_cast<block_list_node*>(h->data() );
          bln->next = _bin_cache[bin];
          _bin_cache[bin] = bln;
          _bin_cache_size[bin]++;
          assert( _bin_cache_size[bin] == _bin_cache[bin]->count() );
          return true;
       }
       fprintf( stderr, "cache full bin %d size %d", bin, _bin_cache_size[bin] );
       assert( _bin_cache[bin] != nullptr );
       return false;
       */
    }

    block_header* fetch_cache( int bin )
    {
       if( _bin_cache[bin] )
       {
         block_header* b = _bin_cache[bin];
         assert( b->page_size() == PAGE_SIZE );
         _bin_cache[bin] = nullptr;
         return b;
       }
       return nullptr;
      /*
       if( _bin_cache_size[bin] > 0 )
       {
          assert( _bin_cache_size[bin] == _bin_cache[bin]->count() );
          assert( _bin_cache[bin] );
          auto h = _bin_cache[bin];
          _bin_cache[bin] = h->next;
          _bin_cache_size[bin]--;
          auto head = h->header();
          assert( head->page_size() == PAGE_SIZE );
          assert( LOG2(head->size()) >= bin );
          assert( LOG2(head->size()) == bin );
          return head;
       }
       assert( !_bin_cache[bin] );
       */
       return nullptr;
    }


    block_header* fetch_block_from_bin( int bin );

    thread_allocator();
    ~thread_allocator();

    friend class garbage_collector;
    bool                         _done;       // cleanup and remove from list.
    std::atomic<block_header*>   _gc_at_bat;  // where the gc pulls from.
    uint64_t                     _gc_pad[7];  // gc thread and this thread should not false-share these values
    block_header*                _gc_on_deck; // where we save frees while waiting on gc to bat.

    /** 
     * called by gc thread and pops the at-bat free list
     */
    block_header*  get_garbage() // grab a pointer previously claimed.
    {
      if( block_header* gar = _gc_at_bat.load() )
      {
         _gc_at_bat.store(nullptr);// = nullptr;
         return gar;
      }
      return nullptr;
    }
    block_header*               _bin_cache[NUM_BINS];      // head of cache for specific bin
    int16_t                     _bin_cache_size[NUM_BINS]; // track num of nodes in cache

    thread_allocator*           _next; // used by gc to link thread_allocs together
};


typedef thread_allocator* thread_alloc_ptr;


/**
 *   Polls all threads for freed items.
 *   Upon receiving a freed item, it will look
 *   at its size and move it to the proper recycle
 *   bin for other threads to consume.
 *
 *   When there is less work to do, the garbage collector
 *   will attempt to combine blocks into larger blocks
 *   and move them to larger cache sizes until it
 *   ultimately 'completes a page' and returns it to
 *   the system.  
 *
 *   From the perspective of the 'system' an alloc
 *   involves a single atomic fetch_add.
 *
 *   A free involves a non-atomic store.
 *
 *   No other sync is necessary.
 */
class garbage_collector
{
  public:
    garbage_collector();
    ~garbage_collector();

    class recycle_bin
    {
       public:
          recycle_bin()
          :_read_pos(0),_full_count(0),_full(2),_write_pos(0)
          {
             memset( &_free_queue, 0, sizeof(_free_queue) );
             _free_list = nullptr;
          }

          // read the _read_pos without any atomic sync, we only care about an estimate
          int64_t available()                            { return _write_pos - *((int64_t*)&_read_pos); }
          // reserve right to read the next num spots from buffer
          int64_t claim( int64_t num )                   { return _read_pos.fetch_add(num); }
          block_header* get_block( int64_t claim_pos )   { return _free_queue.at(claim_pos); }
          void          clear_block( int64_t claim_pos ) { _free_queue.at(claim_pos) = nullptr; }

          // determines how many chunks should be required to consider this bin full.
          // TODO: this method needs to be tweaked to factor in 'time'... as it stands
          // now the GC loop will be very agressive at shrinking the queue size
          int64_t       check_status()
          {
              return 8 - available();
            /*
              auto av = available();
              int consumed = _last_fill - av;
              if( consumed > _last_fill/2 ) ++_full;

              if( av <= 0 )
              {
                 // apparently there is high demand, the consumers cleaned us out.
                 _full *= 2; // exponential growth..
                 _full = std::min( _full+4, _free_queue.get_buffer_size() -1 );
                 fprintf( stderr, "%d  blocks available,   _full %d\n", int(av), int(_full) );
              }
              else if( av == _full )
              {
                 // apparently no one wanted any... we should shrink what we consider full
                 _full -= 4; // fast back off
                 if( _full < 2 ) _full = 2;
              }
              else // av < _full
              {
                 // some, but not all have been consumed... 
                 // if less than half have been consumed... reduce size,
                 // else keep the size the same.
                 if( av > _full/2 )
                 {
                     _full--; // reduce full size,slow back off
                     if( _full < 2 ) _full = 2;
                     return  _full - av; 
                 }
                 else // more than half consumed... keep full size the same, refill
                 {
                 }
              }
               fprintf( stderr, "%d  blocks available,   _full %d  post %d\n", int(av), int(_full), int(_full-av) );
              return _full - av; 
              */
          }


          ring_buffer<block_header*,128>         _free_queue; 
          std::atomic<int64_t>                  _read_pos; //written to by read threads
          int64_t _pad[7];     // below this point is written to by gc thread
          int64_t _full_count; // how many times gc thread checked and found the queue full
          int64_t _full;       // limit the number of blocks kept in queue
          int64_t _write_pos;  // read by consumers to know the last valid entry.
          int64_t _last_fill;  // status of the buffer at the last check.

          void push( block_header* h )
          {
             h->set_state( block_header::idle );
             block_header::queue_state& qs = h->init_as_queue_node(); 
             qs.next = _free_list;
             if( _free_list ) 
             {
                _free_list->as_queue_node().prev = h;
             }
             _free_list = h;
          }

          block_header* pop()
          {
              auto tmp = _free_list;
              if( _free_list ) 
              {
                 auto n = _free_list->as_queue_node().next;
                 if( n ) 
                    n->as_queue_node().prev = nullptr;
                 _free_list = n;
                 assert( tmp->get_state() == block_header::idle );
                 tmp->set_state( block_header::unknown ); // TODO: only if DEBUG
              }
              return tmp;
          }

          // blocks are stored as a double-linked list
          block_header* _free_list;
    };

    recycle_bin& find_cache_bin_for( block_header* h ) 
    { 
      assert(h!=nullptr);
      int bn = get_bin_num(h->size());
  //    fprintf( stderr,  "block header size %d  is cached in bin %d holding sizes %d\n", (int)h->size(), bn, (1<<(bn)) );
      return get_bin(get_bin_num( h->size() )); 
    }

    int get_bin_num( size_t s )
    {
      return LOG2(s);
    }

    recycle_bin&  get_bin( size_t bin_num ) 
    { 
        assert( bin_num < NUM_BINS );
        return _bins[bin_num];
    }

    void register_allocator( thread_alloc_ptr ta );

    static garbage_collector& get()
    {
        static garbage_collector gc;
        return gc;
    }
  private:
    static void  run();
    // threads that we are actively looping on
    std::atomic<thread_alloc_ptr> _thread_head;

    std::thread                _thread; // gc thread.. doing the hard work
    recycle_bin                _bins[NUM_BINS];


    static std::atomic<bool>   _done;
};
std::atomic<bool> garbage_collector::_done(false);

garbage_collector::garbage_collector()
:_thread_head(nullptr),_thread( &garbage_collector::run )
{
   fprintf( stderr, "allocating garbage collector\n" );
}
garbage_collector::~garbage_collector()
{
  _done.store(true, std::memory_order_release );
  _thread.join();
}

void garbage_collector::register_allocator( thread_alloc_ptr ta )
{
  printf( "registering thread allocator %p\n", ta );

  auto* stale_head = _thread_head.load(std::memory_order_relaxed);
  do { ta->_next = stale_head;
  }while( !_thread_head.compare_exchange_weak( stale_head, ta, std::memory_order_release ) );
}

void  garbage_collector::run()
{
    fprintf( stderr, "Starting GC loop\n");
    try
    {
      garbage_collector& self = garbage_collector::get();
      while( true )
      {
          thread_alloc_ptr cur_al = *((thread_alloc_ptr*)&self._thread_head);
          bool found_work = false;
      
          // for each thread, grab all of the free chunks and move them into
          // the proper free set bin, but save the list for a follow-up merge
          // that takes into consideration all free chunks.
          while( cur_al )
          {
              auto cur = cur_al->get_garbage();
              
              if( cur )
              {
                assert( cur->page_size() == PAGE_SIZE );
                found_work = true; 
              }
              
              while( cur )
              {
                  assert( cur->page_size() == PAGE_SIZE );
                  block_header* nxt = cur->as_queue_node().next;
                  assert( nxt != cur );
                  if( nxt ) assert( nxt->page_size() == PAGE_SIZE );

                  assert( cur->page_size() == PAGE_SIZE );
                  auto before = cur->size();
                //  fprintf( stderr, "found free block of size: %d\n", cur->size() );
                  cur->init_as_queue_node();
                  assert( cur->page_size() == PAGE_SIZE );
                  cur->set_state( block_header::idle );
                  assert( cur->page_size() == PAGE_SIZE );

                  cur = cur->merge_next();
               //   cur = cur->merge_prev();
                  if( before != cur->size() )
                  fprintf( stderr, "found free block of after merges..: %d\n", cur->size() );
              
                  assert( cur->page_size() == PAGE_SIZE );
                  recycle_bin& c_bin = self.find_cache_bin_for(cur);
                  assert( cur->page_size() == PAGE_SIZE );
              //    fprintf( stderr, "pushing into bin\n" );
                  c_bin.push(cur); 
                  assert( cur->page_size() == PAGE_SIZE );
              
                  cur = nxt;
                  assert( cur->page_size() == PAGE_SIZE );
              }

              assert( cur_al != cur_al->_next );
              // get the next thread.
              cur_al = cur_al->_next;
          }
      
          // for each recycle bin, check the queue to see if it
          // is getting low and if so, put some chunks in play
          for( int i = 0; i < NUM_BINS; ++i )
          {
              garbage_collector::recycle_bin& bin = self._bins[i];
              auto needed = bin.check_status(); // returns the number of chunks need
              if( needed > 0 )
              {
                  int64_t next_write_pos = bin._write_pos;
                  block_header* next = bin.pop();

                  while( next && needed > 0 )
                  {
                   //   fprintf( stderr, "poping block from bin %d and pushing into queue\n", i );
                      found_work = true;
                      ++next_write_pos;
                      if( bin._free_queue.at(next_write_pos) )
                      {
                          // someone left something behind... 
                      }
                      else
                      {
                          bin._free_queue.at(next_write_pos) = next;
                          next = bin.pop();
                      }
                      --needed;
                  }
                  if( next ) bin.push(next); // leftover... 
                  bin._write_pos = next_write_pos;
              }
              else if( needed < 0 )
              {
                // apparently no one is checking this size class anymore, we can reclaim some nodes.
                // TODO:  perhaps we only do this if there is no other work found as work implies
                // that the user is still allocating / freeing objects and thus we don't want to
                // compete to start freeing cache yet... 
              }
          }
          if( !found_work ) usleep( 1000 );
      
          if( _done.load( std::memory_order_acquire ) ) return;
          if( !found_work ) 
          {
              // reclaim cache
              // sort... and optimize....
          }
      }
    }
    catch ( ... )
    {
        fprintf( stderr, "gc caught exception\n" );
    }
    fprintf( stderr, "exiting gc loop\n" );
}


block_header* allocate_block_page()
{
    fprintf( stderr, "\n\n                                                                               ALLOCATING NEW PAGE\n\n" );
    auto limit = mmap_alloc( PAGE_SIZE );

    block_header* bl = reinterpret_cast<block_header*>(limit);
    bl->init( PAGE_SIZE );
    
    return bl;
}

thread_allocator::thread_allocator()
{
  _done            = false;
  _next            = nullptr;
  //_gc_at_bat       = nullptr;
  _gc_on_deck      = nullptr;

  memset( _bin_cache, 0, sizeof(_bin_cache) );
  memset( _bin_cache_size, 0, sizeof(_bin_cache_size) );
  garbage_collector::get().register_allocator(this);
}

thread_allocator::~thread_allocator()
{
  // give the rest of our allocated chunks to the gc thread
  // free all cache, free _alloc_block
  _done = true;
}

int get_min_bin( size_t s )
{
  return LOG2(s)+1;
}

char* thread_allocator::alloc( size_t s )
{
 //   fprintf( stderr, "    alloc %d\n", (int)s );
    if( s == 0 ) return nullptr;
    size_t data_size = s;

    // we need 8 bytes for the header, then round to the nearest
    // power of 2.
    int min_bin = LOG2(s+7)+1; // this is the bin size.
    s = (1<<min_bin)-8; // the data size is bin size - 8
    assert( s >= data_size );
    
    for( int bin = min_bin; bin < NUM_BINS; ++bin )
    {
        block_header* b = fetch_block_from_bin(bin);
        if( b )
        {
           fprintf( stderr, "found cache in bin %d\r", bin );
           assert( b->page_size() == PAGE_SIZE );
           block_header* tail = b->split_after( s );
           assert( b->page_size() == PAGE_SIZE );
           if( tail ) assert( tail->page_size() == PAGE_SIZE );
           assert( b->size() >= s );
           if( tail && !store_cache( tail ) ) 
           {
              fprintf( stderr, "unable to cache tail, free it\n" );
              this->free( tail->data() );
           }
           assert( b->size() >= s );
           return b->data();
        }
    }


    block_header* new_page = allocate_block_page();
    //printf( "      alloc new block page   %p  _size  %d _prev_size %d  next %p  prev %p\n",
      //    new_page, new_page->_size, new_page->_prev_size, new_page->next(), new_page->prev() );
    block_header* tail = new_page->split_after(s);
//    printf( "      alloc free tail  %p  _size  %d _prev_size %d  next %p  prev %p  tail %p\n",
 //         tail, tail->_size, tail->_prev_size, tail->next(), tail->prev(), tail );
    
    if( tail && !store_cache( tail ) )
    {
       this->free( tail->data() );
    }

    assert( new_page->size() >= s-8 );
    return new_page->data();
}

/**
 *  Checks our local bin first, then checks the global bin.
 *
 *  @return null if no block found in cache.
 */
block_header* thread_allocator::fetch_block_from_bin( int bin )
{
//    fprintf( stderr, "fetch cache %d  has %d items remaining\n", bin, int(_bin_cache_size[bin]) );    
    auto lo = fetch_cache(bin);
    if( lo ) return lo;
    assert( _bin_cache_size[bin] == 0 );

    garbage_collector& gc              = garbage_collector::get();
    garbage_collector::recycle_bin& rb = gc.get_bin( bin );

    if( auto avail = rb.available()  )
    {
        // claim up to half of the available, just incase 2
        // threads try to claim at once, they both can, but
        // don't hold a cache of more than 4 items
        auto claim_num = 2;//std::min<int64_t>( avail/2, 1 ); 
        // claim_num could now be 0 to 3
        //claim_num++; // claim at least 1 and at most 4

        // this is our one and only atomic 'sync' operation... 
        auto claim_pos = rb.claim( claim_num );
        auto claim_end = claim_pos + claim_num;
        bool found = false;
        while( claim_pos != claim_end )
        {
           block_header* h = rb.get_block(claim_pos);
           if( h )
           {
              found = true;
              rb.clear_block(claim_pos); // let gc know we took it. 
              ++claim_pos;
              if( claim_pos == claim_end )
              {
                  return h;
              }
              else if( !store_cache(h ) )
              {
                assert( !"unable to cache something we asked for!"  );
              }
           }
           else // oops... I guess 3 tried to claim at once...
           {
              ++claim_pos;
              // drop it on the floor and let the
              // gc thread pick it up next time through the
              // ring buffer.
           }
        }
        if( found ) 
        {
           fprintf( stderr, "apparently we were over drew the queue...\n" );
           return fetch_cache(bin); // grab it from the cache this time.
        }
    }
    return nullptr;
}

char* malloc2( int s )
{
  return thread_allocator::get().alloc(s);
}

void  free2( char* s )
{
  return thread_allocator::get().free(s);
}


#include "bench.cpp"


================================================
FILE: mmap_alloc.hpp
================================================
#pragma once
#include <algorithm>
extern "C" {
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <math.h>
}


size_t pagesize()
{
  return ::getpagesize();
}
size_t page_count( size_t s )
{
    return static_cast< size_t >( ceilf( static_cast< float >( s) / pagesize() ) );
}

char* mmap_alloc( size_t s, void* loc = 0 )
{
   //fprintf( stderr, "mmap_alloc %llu   %p\n", s, loc );
   const std::size_t pages( page_count(s) ); // add +1 for guard page
   std::size_t size_ = pages * pagesize();
   
   # if defined(macintosh) || defined(__APPLE__) || defined(__APPLE_CC__)
    void* limit = ::mmap( loc, size_, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_PRIVATE | MAP_ANON, -1, 0);
   # else
    const int fd( ::open("/dev/zero", O_RDONLY) );
    assert( -1 != fd);
    void* limit = ::mmap( loc, size_, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
   # endif
   if ( !limit ) throw std::bad_alloc();
   return static_cast<char*>(limit);
}

void mmap_free( void* pos, size_t s )
{
   const std::size_t pages( page_count( s) ); // add +1 for guard page
   std::size_t size_ = pages * pagesize();
   ::munmap( pos, size_);
}