Repository: bytemaster/fc_malloc
Branch: master
Commit: 7a56cf9eae24
Files: 19
Total size: 145.0 KB

Directory structure:
gitextract_3qg3aq5m/
├── .gitignore
├── CMakeLists.txt
├── README.md
├── bench.cpp
├── bit_index.cpp
├── bit_index.hpp
├── disruptor.hpp
├── fast_rand.cpp
├── fc_heap.hpp
├── fc_malloc.cpp
├── fc_malloc.h
├── fixed_pool.hpp
├── garbage_collector.hpp
├── hheap.cpp
├── ideas.txt
├── malloc2.cpp
├── malloc2.hpp
├── malloc3.cpp
└── mmap_alloc.hpp

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
# Compiled Object files
*.slo
*.lo
*.o

# Compiled Dynamic libraries
*.so
*.dylib

# Compiled Static libraries
*.lai
*.la
*.a

================================================
FILE: CMakeLists.txt
================================================
project( fc_malloc )
cmake_minimum_required( VERSION 2.8.8 )

IF( WIN32 )
  ADD_DEFINITIONS( -DBOOST_CONTEXT_NO_LIB )
  ADD_DEFINITIONS( -D_SCL_SECURE_NO_WARNINGS )
  ADD_DEFINITIONS( -D_WIN32_WINNT=0x0501 )
  ADD_DEFINITIONS( -D_CRT_SECURE_NO_WARNINGS )
ELSE(WIN32)
  SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x -Wall -Wno-unused-local-typedefs")
ENDIF(WIN32)

#add_executable( m3 malloc3.cpp )
add_executable( fheap bench.cpp )
target_link_libraries( fheap jemalloc )

================================================
FILE: README.md
================================================
fc_malloc
=========

Super fast, lock-free, wait-free, CAS-free, thread-safe memory allocator.

Design
==================

The key to developing fast multi-threaded allocators is eliminating lock contention and false sharing. Even simple atomic operations and spin-locks can destroy the performance of an allocation system. The real challenge is that the heap is a multi-producer, multi-consumer resource where all threads need to read and write the common memory pool.

With fc_malloc I borrowed design principles from the LMAX disruptor and assigned a dedicated thread to move free blocks from all of the other threads to the shared pool. This makes every thread a 'single producer' of free blocks, and therefore each thread can maintain a lock-free, wait-free per-thread free list. It also makes the dedicated thread the single producer of free blocks for the shared pool, which means blocks can be acquired with a single-producer, multiple-consumer pattern.

When more memory is needed and the existing free lists are not sufficient, each thread maps its own range from the OS in 4 MB chunks. Allocating from this 'cache miss' is not much slower than allocating stack space and requires no contention. Requests larger than 4 MB are allocated directly from the OS via mmap.

Initial Benchmarks
==================

Testing memory allocation systems can be very difficult, and artificial tests are not always the most accurate predictors of real-world performance, but I sought to develop a test that would stress the allocation system, particularly in multi-threaded environments.

The test I came up with creates one array per thread, each with space for 500K allocations. I then assigned each thread the job of randomly filling empty slots in one array while randomly freeing slots in another thread's array. The result is a 'random' set of producer-consumer threads. Each allocation was 128 bytes; future versions of this benchmark will include random sizes as well.
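Each worker thread's inner loop looks roughly like the sketch below. This is an illustrative outline of the benchmark just described, not the exact driver (that lives in bench.cpp); the `bench_round` helper and its signature are invented here for clarity, and `do_alloc`/`do_free` stand in for whichever allocator is under test:

```cpp
#include <cstdlib>
#include <vector>

// One round of the random producer/consumer benchmark: fill random empty slots of
// this thread's array with fresh 128-byte allocations, then free random occupied
// slots of another thread's array.
void bench_round( std::vector<char*>& produce, std::vector<char*>& consume,
                  char* (*do_alloc)(int), void (*do_free)(char*) )
{
   for( size_t x = 0; x < produce.size()/4; ++x )
   {
      size_t p = rand() % produce.size();
      if( !produce[p] ) produce[p] = do_alloc( 128 );   // allocate into an empty slot
   }
   for( size_t x = 0; x < consume.size()/4; ++x )
   {
      size_t p = rand() % consume.size();
      if( consume[p] ) { do_free( consume[p] ); consume[p] = nullptr; }  // release an occupied slot
   }
}
```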
| Benchmark                  | glibc | jemalloc | fc_malloc |
|----------------------------|-------|----------|-----------|
| Random Single Threaded     | 5.8s  | 4.5s     | 2.6s      |
| Random Multi Threaded (10) | 18.2s | 13.6s    | 6.8s      |

Threads | fc_malloc (s) | jemalloc (s) | fc_malloc RAM (MB) | jemalloc RAM (MB)
---|---|---|---|---
1  | 4.8  | 9.7  | 97    | 84.3
2  | 5.9  | 14.8 | 120   | 104
3  | 6.5  | 16.8 | 145   | 123
4  | 7    | 18   | 167   | 142
5  | 8    | 18.9 | 185.5 | 160
6  | 8.7  | 20.3 | 214.3 | 189
7  | 9.9  | 22.9 | 238   | 212
8  | 11.4 | 25.2 | 257   | 224
9  | 12.5 | 26.1 | 278   | 244
10 | 12.9 | 27.9 | 308   | 270

As you can see from the results, fc_malloc is over 2x faster than the stock malloc even in the single-threaded case, and 2.6x faster in the multi-threaded case. The real test, though, is the comparison to jemalloc, which is generally considered one of the highest-performing alternative allocators available. Here fc_malloc is still 2x faster in the multi-threaded test.

================================================
FILE: bench.cpp
================================================
#include "fixed_pool.hpp" #include <thread> #include <vector> #include <iostream> #include <sstream> #include <cstring> #define BENCH_SIZE ( (1024*16*2) ) #define ROUNDS 3000 /* SEQUENTIAL BENCH int main( int argc, char** argv ) { if( argc == 2 && argv[1][0] == 'S' ) { printf( "fp_malloc\n"); for( int i = 0; i < 50000000; ++i ) { char* test = fp_malloc( 128 ); assert( test != nullptr ); test[0] = 1; free2( test ); } } if( argc == 2 && argv[1][0] == 's' ) { printf( "malloc\n"); for( int i = 0; i < 50000000; ++i ) { char* test = (char*)malloc( 128 ); assert( test != nullptr ); test[0] = 1; free( test ); } } fprintf( stderr, "done\n"); // sleep(5); return 0; } */ /* RANDOM BENCH */ std::vector<int64_t*> buffers[16]; void pc_bench_worker( int pro, int con, char* (*do_alloc)(int s), void (*do_free)(char*) ) { int64_t total_alloc = 0; int64_t total_free = 0; int64_t total_block_alloc = 0; int64_t total_free_alloc = 0; for( int r = 0; r < ROUNDS; ++r ) { for( size_t x = 0; x < BENCH_SIZE/4 ; ++x ) { uint32_t p = rand() % buffers[pro].size(); if( !buffers[pro][p] ) { uint64_t si = 10000;//16 +rand()%(1024); //4000;//32 + rand() % (1<<16); total_alloc += si; int64_t* r = (int64_t*)do_alloc( si ); // block_header* bh = ((block_header*)r)-1; // assert( bh->size() >= si + 8 ); // fprintf( stderr, "alloc: %p %llu of %llu %u\n", r, si, bh->size(), bh->_size ); assert( r != nullptr ); // assert( r[0] != 99 ); memset( r, 0x00, si ); // r[0] = 99; // total_block_alloc += r[1] = ((block_header*)r)[-1].size(); buffers[pro][p] = r; } } for( size_t x = 0; x < BENCH_SIZE/4 ; ++x ) { uint32_t p = rand() % buffers[con].size(); assert( p < buffers[con].size() ); assert( con < 16 ); assert( con >= 0 ); if( buffers[con][p] ) { // assert( buffers[con][p][0] == 99 ); // buffers[con][p][0] = 0; // total_free += buffers[con][p][0]; // total_free_alloc += buffers[con][p][1]; do_free((char*)buffers[con][p]); buffers[con][p] = nullptr; } } /* fprintf( stderr, "\n Total Alloc: %lld Total Free: %lld Net: %lld\n", total_alloc, total_free, (total_alloc-total_free) ); fprintf( stderr, "\n Total Block Size: %lld Total Free Blocks: %lld Net: %lld\n\n", total_block_alloc, total_free_alloc, (total_block_alloc-total_free_alloc) ); auto needed = (total_alloc-total_free); auto used = (total_block_alloc-total_free_alloc); auto wasted = used - needed; fprintf( stderr, "\n Total Waste: %lld %f\n\n", wasted, double(used)/double(needed) ); */ } } void pc_bench(int n, char* (*do_alloc)(int s), void (*do_free)(char*) ) { for( int i = 0; i < 16; ++i ) { buffers[i].resize( BENCH_SIZE ); memset( buffers[i].data(), 0, 8 * BENCH_SIZE );
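// (the 8-byte multiplier above assumes pointer-sized slots in buffers[i]; each slot holds one allocation)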
} std::thread* a = nullptr; std::thread* b = nullptr; std::thread* c = nullptr; std::thread* d = nullptr; std::thread* e = nullptr; std::thread* f = nullptr; std::thread* g = nullptr; std::thread* h = nullptr; std::thread* i = nullptr; std::thread* j = nullptr; int s = 1; switch( n ) { case 10: a = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } ); n--; s++; case 9: b = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } ); n--; s++; case 8: c = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } ); n--; s++; case 7: d = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } ); n--; s++; case 6: e = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } ); n--; s++; case 5: f = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } ); n--; s++; case 4: g = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } ); n--; s++; case 3: h = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } ); n--; s++; case 2: i = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } ); n--; s++; case 1: j = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } ); } if(a) a->join(); if(b) b->join(); if(c) c->join(); if(d) d->join(); if(e) e->join(); if(f) f->join(); if(g) g->join(); if(h) h->join(); if(i) i->join(); if(j) j->join(); } void pc_bench_st(char* (*do_alloc)(int s), void (*do_free)(char*) ) { for( int i = 0; i < 16; ++i ) { buffers[i].resize( BENCH_SIZE ); memset( buffers[i].data(), 0, 8 * BENCH_SIZE ); } //int i = 0; pc_bench_worker( 1, 1, do_alloc, do_free ); } //#include char* do_malloc(int s) { return (char*)::malloc(s); // return (char*)scalable_malloc(s); } void do_malloc_free(char* c) { // scalable_free(c); ::free(c); } char* do_fc_malloc(int s) { return (char*)fp_malloc(s); // return (char*)fc_malloc(s); // return (char*)scalable_malloc(s); } void do_fc_free(char* c) { fp_free((void*)c); // scalable_free(c); // fc_free(c); } int main( int argc, char** argv ) { /* char* a = static_heap.alloc32(); char* b = static_heap.alloc32(); char* c = static_heap.alloc32(); fprintf( stderr, "%p %p %p\n", a, b, c ); static_heap.free32(b); char* d = static_heap.alloc32(); fprintf( stderr, "%p %p %p\n", d, b, c ); return 0; */ if( argc > 2 && argv[1][0] == 'm' ) { std::cerr<<"malloc multi\n"; pc_bench( atoi(argv[2]), do_malloc, do_malloc_free ); return 0; } if( argc > 2 && argv[1][0] == 'M' ) { std::cerr<<"hash malloc multi\n"; // pc_bench( atoi(argv[2]), do_fp_malloc, do_fp_free ); pc_bench( atoi(argv[2]), do_fc_malloc, do_fc_free ); return 0; } if( argc > 1 && argv[1][0] == 's' ) { std::cerr<<"malloc single\n"; pc_bench_st( do_malloc, do_malloc_free ); return 0; } if( argc > 1 && argv[1][0] == 'S' ) { std::cerr<<"hash malloc single\n"; pc_bench_st( do_fc_malloc, do_fc_free ); return 0; } std::string line; std::getline( std::cin, line ); std::vector data; while( !std::cin.eof() ) { std::stringstream ss(line); std::string cmd; ss >> cmd; if( cmd == "a" ) // allocate new data { int64_t bytes; ss >> bytes; data.push_back( (char*)fp_malloc( bytes ) ); } if( cmd == "f" ) // free data at index { int64_t idx; ss >> idx; fp_free( data[idx] ); data.erase( data.begin() + idx ); } if( cmd == "c" ) // print cache { // thread_allocator::get().print_cache(); } if( cmd == "p" ) // print heap { } if( cmd == "l" ) // list data { fprintf( stderr, "ID] ptr _size _prev_size\n"); fprintf( stderr, "-----------------------------\n"); for( size_t i = 0; i < 
data.size(); ++i ) { // block_header* bh = reinterpret_cast(data[i]-8); fprintf( stderr, "%d] %p \n", int(i), data[i]); } } std::getline( std::cin, line ); } return 0; } #if 0 printf( "alloc\n" ); char* tmp = fp_malloc( 61 ); usleep( 1000 ); char* tmp2 = fp_malloc( 134 ); usleep( 1000 ); char* tmp4 = fp_malloc( 899 ); printf( "a %p b %p c %p\n", tmp, tmp2, tmp4 ); usleep( 1000 ); printf( "free\n" ); free2( tmp ); usleep( 1000 ); free2( tmp2 ); usleep( 1000 ); free2( tmp4 ); usleep( 1000*1000 ); printf( "alloc again\n" ); char* tmp1 = fp_malloc( 61 ); usleep( 1000 ); char* tmp3 = fp_malloc( 134 ); usleep( 1000 ); char* tmp5 = fp_malloc( 899 ); printf( "a %p b %p c %p\n", tmp1, tmp3, tmp5 ); free2( tmp1 ); free2( tmp3 ); free2( tmp4 ); usleep( 1000*1000 ); return 0; } #endif ================================================ FILE: bit_index.cpp ================================================ #include "bit_index.hpp" #include int main( int argc, char** argv ) { bit_index<64*64*64> b; b.set_all(); for( int i = 0; i < 66; ++i ) { b.clear(i); assert( !b.get(i) ); fprintf( stderr, "\nI: %d\n", i ); if( i >= 62 ) b.dump(); if( b.first_set_bit() != i+1 ) { exit(1); } } for( int i = 0; i < 66; ++i ) { assert( !b.get(i) ); } assert( b.get(67) ); return 0; fprintf( stderr, "pow64(1) = %d\n", pow64<1>::value ); fprintf( stderr, "pow64(2) = %d\n", pow64<2>::value ); fprintf( stderr, "log64(pow64(2)) = %d\n", log64::value>::value ); fprintf( stderr, "pow64(log64(64*64)) = %d\n", pow64::value>::value ); fprintf( stderr, "pow64(log64(64*64*64)) = %d\n", pow64::value>::value ); fprintf(stderr, "=========== 64 =============\n" ); bit_index<64> _index; fprintf( stderr, "first set bit: %d\n", _index.first_set_bit() ); assert( _index.first_set_bit() == 64 ); _index.set( 34 ); fprintf( stderr, "first set bit: %d\n", _index.first_set_bit() ); assert( _index.get(34) ); assert( _index.first_set_bit() == 34 ); _index.clear(34); assert( !_index.get(34) ); assert( _index.first_set_bit() == 64 ); fprintf(stderr, "=========== 64*64 =============\n" ); bit_index<64*64> _b62; _b62.set(1010); fprintf( stderr, "first set bit: %d\n", _b62.first_set_bit() ); assert( _b62.first_set_bit() == 1010 ); assert( _b62.get(1010) ); assert( _b62.clear(1010) ); assert( !_b62.get(1010) ); fprintf(stderr, "=========== 64*64*64 =============\n" ); bit_index<64*64*64> _b64; fprintf( stderr, "init first bit b64: %d\n", _b64.first_set_bit() ); _b64.set( 660 ); fprintf( stderr, "first set: %d\n", _b64.first_set_bit() ); assert( _b64.get(660) ); _b64.clear(660); fprintf( stderr, "final first bit b64: %d\n", _b64.first_set_bit() ); assert( !_b64.get(660) ); bit_index<64*64*64> _b6464; fprintf( stderr, "SET BIT 66\n" ); _b6464.set( 66 ); fprintf( stderr, "first set 66?? 
: %d\n", _b6464.first_set_bit() ); fprintf( stderr, "size of %d 64*64*64\n", int(sizeof(_b64) ) ); bit_index<64*64*64*64> _bbb; fprintf( stderr, "size of %d 64*64*64*64 \n\n\n", int(sizeof(_bbb) ) ); _bbb.set(444); assert(_bbb.get(444) ); { bit_index<64*64> _bbb; fprintf( stderr, "size of %d 64*64*64*64 \n\n\n", int(sizeof(_bbb) ) ); _bbb.set(444); assert(_bbb.get(444) ); } /* { bit_index<20*64*64> _bbb; fprintf( stderr, "size of %d 64*64*64*64 \n\n\n", int(sizeof(_bbb) ) ); _bbb.set(444); assert(_bbb.get(444) ); } */ _index.set(3); _index.set(9); _index.set(27); auto itr = _index.at( _index.first_set_bit() ); while( !itr.end() ) { fprintf( stderr, "next bit %lld\n", itr.bit() ); itr.next_set_bit(); } { _b62.set(3); _b62.set(9); _b62.set(270); _b62.set(570); _b62.set(1270); auto itr = _b62.at( _b62.first_set_bit() ); while( !itr.end() ) { fprintf( stderr, "_b62 next bit %lld\n", itr.bit() ); itr.next_set_bit(); } } auto tmp = _bbb.begin(); return 0; } ================================================ FILE: bit_index.hpp ================================================ #pragma once #include #include #include #define LZERO(X) (__builtin_clzll((X)) ) template class bit_index; template struct log64; template<> struct log64<64> { enum { value = 1 }; }; template<> struct log64<0> { enum { value = 0 }; }; template struct log64 { enum { value = 1 + log64::value }; }; template struct pow64; template<> struct pow64<0> { enum ev{ value = 1 }; }; template struct pow64 { enum ev{ value = pow64::value*64ll }; }; template<> class bit_index<1> { public: enum size_enum { index_size = 1 }; void set( uint64_t pos = 0) { assert( pos == 0 ); bit = 1; } bool get( uint32_t pos = 0)const { return bit; } uint64_t& get_bits(uint64_t ) { return bit; } bool clear( uint64_t pos = 0) { assert( pos == 0 ); return !(bit = 0); } void clear_all() { clear(); } void set_all() { set(); } uint64_t first_set_bit()const { return !bit; } uint64_t size()const { return 1; } struct iterator { public: uint64_t& get_bits() { return _self->bit; } bool end()const { return _bit == 1; } int64_t bit()const { return _bit; } void set() { _self->set(_bit); } bool clear() { return _self->clear(_bit); } bool operator*()const { return _self->get(_bit); } iterator& next_set_bit() { _bit = 1; return *this; } iterator( bit_index* s=nullptr, uint8_t b = 64 ):_self(s),_bit(b){} private: bit_index* _self; uint8_t _bit; }; iterator at( uint64_t p ) { return iterator(this, p); } private: uint64_t bit; }; template<> class bit_index<0> : public bit_index<1>{}; template<> class bit_index<64> { public: enum size_enum { index_size = 64 }; bit_index(uint64_t s = 0):_bits(s){} /** * option A: use conditional to check for 0 and return 64 */ uint64_t first_set_bit()const { return _bits == 0 ? 64 : LZERO(_bits); } void dump( int depth ) { for( int i = 0; i < depth; ++i ) fprintf( stderr, " " ); fprintf( stderr, "%llx\n", _bits ); } /** * Option 2, compare + shift + lzr + compare + mult + or... this approach.. while * the result of LZERO(0) is undefined, multiplying it by 0 is defined. * * This code may be faster or slower depending upon this cache miss rate and * the instruction level parallelism. Benchmarks are required. 
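 *
 * Concretely, for the commented-out variant below: when _bits == 0 the expression
 * evaluates to (1<<6) | (LZERO(0) * 0) == 64, and when _bits != 0 it evaluates to
 * (0<<6) | (LZERO(_bits) * 1) == LZERO(_bits), matching option A without a branch.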
*/ //uint64_t first_set_bit()const { return (_bits == 0)<<6 | (LZERO(_bits) * (_bits!=0)); } bool get( uint64_t pos )const { return _bits & (1ll<<(63-pos)); } void set( uint64_t pos ) { assert( pos < 64 ); _bits |= (1ll<<(63-pos)); } bool clear( uint64_t pos ) { // fprintf( stderr, "bit_index<64>::clear %llu\n", pos ); _bits &= ~(1ll<<(63-pos)); //fprintf( stderr, "bit_index<64> clear: %p %llx\n", this, _bits ); //fprintf( stderr, "bit_index<64>::clear %llu return %llu == 0\n", pos, _bits ); return _bits == 0; } uint64_t size()const { return 64; } uint64_t count()const { return __builtin_popcountll(_bits); } void set_all() { _bits = -1; } void clear_all() { _bits = 0; } uint64_t& get_bits( uint64_t bit ) { assert( bit < 64 ); return _bits; } struct iterator { public: uint64_t& get_bits() { return _self->_bits; } bool end()const { return _bit == 64; } int64_t bit()const { return _bit; } void set() { _self->set(_bit); } bool clear() { return _self->clear(_bit); } bool operator*()const { return _self->get(_bit); } iterator& next_set_bit() { ++_bit; if( end() ) return *this; bit_index tmp( (_self->_bits << (_bit))>>(_bit) ); _bit = tmp.first_set_bit(); return *this; } iterator( bit_index* s=nullptr, uint8_t b = 64 ):_self(s),_bit(b){} private: bit_index* _self; uint8_t _bit; }; iterator begin() { return iterator(this,0); } iterator at(uint8_t i){ return iterator(this,i); } iterator end() { return iterator(this,64); } protected: friend class iterator; uint64_t _bits; }; /** * A bit_index is a bitset optimized for searching for set bits. The * operations set and clear maintain higher-level indexes to optimize * finding of set bits. * * The fundamental size is 64 bit and the first set bit can be found * with a single instruction. For indexes up-to 64*64 in size, the * first set bit can be found with 2 clz + 1 compare + 1 mult + 1 add. 
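 *
 * Typical usage, mirroring the checks in bit_index.cpp (a sketch, not a spec):
 * @code
 * bit_index<64*64> idx;                  // 4096-bit index
 * idx.set( 1010 );
 * assert( idx.get( 1010 ) );
 * assert( idx.first_set_bit() == 1010 );
 * idx.clear( 1010 );
 * assert( idx.first_set_bit() == idx.size() );  // no bit set -> size()
 * @endcode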
* */ template class bit_index { public: static_assert( Size >= 64, "smaller sizes not yet supported" ); enum size_enum { index_size = Size, sub_index_size = (Size+63) / 64, sub_index_count = Size / sub_index_size }; static_assert( bit_index::sub_index_count > 0, "array with size 0 is too small" ); static_assert( bit_index::sub_index_count <= 64, "array with size 64 is too big" ); void dump( int depth = 0 ) { _base_index.dump( depth + 1 ); for( int i = 0; i < 3; ++i ) _sub_index[i].dump( depth + 2 ); /** for( int i = 0; i < depth; ++i ) fprintf( stderr, " " ); fprintf( stderr, "%llx\n", _bits ); */ } uint64_t size()const { return index_size; } uint64_t first_set_bit()const { uint64_t base = _base_index.first_set_bit(); if( base >= sub_index_count ) { return Size; } auto subidx = _sub_index[base].first_set_bit(); return base * sub_index_size + subidx; //_sub_index[base].first_set_bit(); } bool get( uint64_t bit )const { assert( bit < Size ); int64_t sub_idx = (bit/sub_index_size); int64_t sub_idx_bit = (bit%sub_index_size); return _sub_index[sub_idx].get( sub_idx_bit ); } void set( uint64_t bit ) { assert( bit < Size ); int64_t sub_idx = (bit/sub_index_size); int64_t sub_idx_bit = (bit%sub_index_size); _base_index.set(sub_idx); return _sub_index[sub_idx].set( sub_idx_bit ); } bool clear( uint64_t bit ) { assert( bit < Size ); int64_t sub_idx = (bit/sub_index_size); int64_t sub_idx_bit = (bit%sub_index_size); if( _sub_index[sub_idx].clear( sub_idx_bit ) ) return _base_index.clear(sub_idx); return false; } void set_all() { _base_index.set_all(); for( uint64_t i = 0; i < sub_index_count; ++i ) { _sub_index[i].set_all(); } } void clear_all() { _base_index.clear_all(); for( uint64_t i = 0; i < sub_index_count; ++i ) { _sub_index[i].clear_all(); } } uint64_t count()const { uint64_t c = 0; for( uint64_t i = 0; i < sub_index_count; ++i ) { c+=_sub_index[i].count(); } return 0; } /** * Returns the in64_t that contains bit */ uint64_t& get_bits( uint64_t bit ) { int64_t sub_idx = (bit/sub_index_size); int64_t sub_idx_bit = (bit%sub_index_size); return _sub_index[sub_idx].get_bits( sub_idx_bit ); } struct iterator { public: uint64_t& get_bits() { return sub_itr.get_bits(); } bool operator*()const { return *sub_itr; } bool end()const { return sub_idx >= sub_index_count; } int64_t bit()const { return pos; } void set() { bit_idx->_base_index.set(sub_idx); sub_itr.set(); } bool clear() { if( sub_itr.clear() ) { return bit_idx->_base_index.clear(sub_idx); } return false; } /** * Find the next bit after this one that is set.. */ iterator& next_set_bit() { if( end() ) return *this; sub_itr.next_set_bit(); if( sub_itr.end() ) { sub_idx = bit_idx->_base_index.at(sub_idx).next_set_bit().bit(); if( end() ) { pos = Size; return *this; } auto fb = bit_idx->_sub_index[sub_idx].first_set_bit(); sub_itr = bit_idx->_sub_index[sub_idx].at(fb); } pos = sub_idx * sub_index_size + sub_itr.bit(); return *this; } /** * Move to the next bit. 
*/ iterator& operator++() { assert( !end() ); ++pos; ++sub_itr; if( sub_itr.end() ) { ++sub_idx; if( !end() ) { sub_itr = bit_idx->_sub_index[sub_idx].begin(); } else pos = Size; } return *this; } iterator& operator++(int) { return this->operator++(); } iterator operator+(uint64_t delta) { return iterator( bit_idx, pos + delta ); } iterator( bit_index* self=nullptr, int64_t bit=Size) :bit_idx(self),pos(bit),sub_idx((bit/64)%64) { sub_itr = bit_idx->_sub_index[sub_idx].at(bit%sub_index_size); } iterator& operator=(const iterator& i ) { bit_idx = i.bit_idx; pos = i.pos; sub_idx = i.sub_idx; sub_itr = i.sub_itr; return *this; } private: friend class bit_index; bit_index* bit_idx; int64_t pos; int8_t sub_idx; typename bit_index::iterator sub_itr; }; iterator begin() { return iterator( this, 0 ); } iterator end() { return iterator( this, Size ); } iterator at(int64_t p) { return iterator( this, p ); } protected: friend class iterator; bit_index<64> _base_index; bit_index _sub_index[sub_index_count]; }; ================================================ FILE: disruptor.hpp ================================================ #pragma once #include #include #include #include #include #include #include namespace disruptor { class eof : public std::exception { public: virtual const char* what()const noexcept { return "eof"; } }; /** * A sequence number must be padded to prevent false sharing and * access to the sequence number must be protected by memory barriers. * * In addition to tracking the sequence number, additional state associated * with the sequence number is also made available. No false sharing * should occur because all 'state' is only written by one thread. This * extra state includes whether or not this sequence number is 'EOF' and * whether or not any alerts have been published. */ class sequence { public: sequence( int64_t v = 0 ):_sequence(v),_alert(0){} int64_t lazy_read()const { return *((volatile int64_t*)&_sequence);}// .load( std::memory_order_acquire); } //volatile int64_t& lazy_write() { return *((volatile int64_t*)&_sequence);}// .load( std::memory_order_acquire); } int64_t aquire()const { return _sequence.load( std::memory_order_acquire); } int64_t aquire_pending()const { return _pending_sequence.load( std::memory_order_acquire); } void lazy_store( int64_t value ) { _sequence.store(value, std::memory_order_relaxed); } void store( int64_t value ) { _sequence.store(value, std::memory_order_release); } void store_pending( int64_t value ) { _pending_sequence.store(value, std::memory_order_release); } void set_eof() { _alert = 1; } void set_alert() { _alert = -1; } bool eof()const { return _alert == 1; } bool alert()const { return _alert != 0; } int64_t atomic_increment_and_get( uint64_t inc ) { return _sequence.fetch_add(inc, std::memory_order::memory_order_release) + inc; } int64_t increment_and_get( uint64_t inc ) { auto tmp = aquire() + inc; store( tmp ); return tmp; } private: std::atomic _sequence; volatile int64_t _alert; std::atomic _pending_sequence; int64_t _post_pad[5]; }; class event_cursor; /** * A barrier will block until all cursors it is following are * have moved past a given position. The barrier uses a * progressive backoff strategy of busy waiting for 1000 * tries, yielding for 1000 tries, and the usleeping in 10 ms * intervals. * * No wait conditions or locks are used because they would * be 'intrusive' to publishers which must check to see whether * or not they must 'notify'. 
The progressive backoff approach * uses little CPU and is a good compromise for most use cases. */ class barrier { public: void follows( const event_cursor& e ); /** * Used to check how much you can read/write without blocking. * * @return the min position of every cusror this barrier follows. */ int64_t get_min(); /* * This method will wait until all s in seq >= pos using a progressive * backoff of busy wait, yield, and usleep(10*1000) * * @return the minimum value of every dependency */ int64_t wait_for( int64_t pos )const; private: mutable int64_t _last_min; std::vector _limit_seq; }; /** * Provides a automatic index into a ringbuffer with * a power of 2 size. */ template class ring_buffer { public: typedef EventType event_type; static_assert( ((Size != 0) && ((Size & (~Size + 1)) == Size)), "Ring buffer's must be a power of 2" ); /** @return a read-only reference to the event at pos */ const EventType& at( int64_t pos )const { return _buffer[pos & (Size-1)]; } /** @return a reference to the event at pos */ EventType& at( int64_t pos ) { return _buffer[pos & (Size-1)]; } /** useful to check for contiguous ranges when EventType is * POD and memcpy can be used. OR if the buffer is being used * by a socket dumping raw bytes in. In which case memcpy * would have to use to ranges instead of 1. */ int64_t get_buffer_index( int64_t pos )const { return pos & (Size-1); } int64_t get_buffer_size()const { return Size; } private: EventType _buffer[Size]; }; /** * A cursor is used to track the location of a publisher / subscriber within * the ring buffer. Cursors track a range of entries that are waiting * to be processed. After a cursor is 'done' with an entry it can publish * that fact. * * There are two types of cursors, read_cursors and write cursors. read_cursors * block when they need to * * Events between [begin,end) may be processed at will for readers. When a reader * is done they can 'publish' their progress which will move begin up to * published position+1. When begin == end, the cursor must call wait_for(end), * wait_for() will return a new 'end'. * * @section read_cursor_example Read Cursor Example * @code auto source = std::make_shared>(); auto dest = std::make_shared>(); auto p = std::make_shared("write",SIZE); auto a = std::make_shared("a"); a->follows(p); p->follows(a); auto pos = a->begin(); auto end = a->end(); while( true ) { if( pos == end ) { a->publish(pos-1); end = a->wait_for(end); } dest->at(pos) = source->at(pos); ++pos; } * @endcode * * * @section write_cursor_example Write Cursor Example * * The following code would run in the publisher thread. The * publisher can write data without 'waiting' until it pos is * greater than or equal to end. The 'initial condition' of * a publisher is with pos > end because the write cursor * cannot 'be valid' for readers until after the first element * is written. * @code auto pos = p->begin(); auto end = p->end(); while( !done ) { if( pos >= end ) { end = p->wait_for(end); } source->at( pos ) = i; p->publish(pos); ++pos; } // set eof to signal any followers to stop waiting after // they hit this position. 
p->set_eof(); @endcode * * * */ class event_cursor { public: event_cursor(int64_t b=-1):_name(""),_begin(b),_end(b){} event_cursor(const char* n, int64_t b=0):_name(n),_begin(b),_end(b){} /** this event processor will process every event * upto, but not including s */ void follows( const event_cursor& s ) { _barrier.follows(s); } /** returns one after cursor */ int64_t begin()const { return _begin; } /** returns one after the last ready as of last call to wait_for() */ int64_t end()const { return _end; } /** makes the event at p available to those following this cursor */ void publish( int64_t p ) { check_alert(); _begin = p + 1; _cursor.store( p ); } void lazy_publish( int64_t p ) { _begin = p + 1; _cursor.lazy_store(p); } /** when the cusor hits the end of a stream, it can set the eof flag */ void set_eof(){ _cursor.set_eof(); } /** If an error occurs while processing data the cursor can set an * alert that will be thrown whenever another cursor attempts to wait * on this cursor. */ void set_alert( std::exception_ptr e ) { _alert = std::move(e); _cursor.set_alert(); } /** @return any alert set on this cursor */ const std::exception_ptr& alert()const { return _alert; } /** If an alert has been set, throw! */ inline void check_alert()const; /** the last sequence number this processor has * completed. */ const sequence& pos()const { return _cursor; } sequence& pos(){ return _cursor; } /** used for debug messages */ const char* name()const { return _name; } protected: /** last know available, min(_limit_seq) */ const char* _name; int64_t _begin; int64_t _end; std::exception_ptr _alert; barrier _barrier; sequence _cursor; }; /** * Tracks the read position in a buffer */ class read_cursor : public event_cursor { public: read_cursor(int64_t p=0):event_cursor(p){} read_cursor(const char* n, int64_t p=0):event_cursor(n,p){} /** @return end() which is > pos */ int64_t wait_for( int64_t pos ) { try { return _end = _barrier.wait_for(pos) + 1; } catch ( const eof& ) { _cursor.set_eof(); throw; } catch ( ... ) { set_alert( std::current_exception() ); throw; } } /** find the current end without blocking */ int64_t check_end() { return _end = _barrier.get_min() + 1; } }; class shared_read_cursor : public read_cursor { public: shared_read_cursor(int64_t p=0):read_cursor(p){} shared_read_cursor(const char* n, int64_t p=0):read_cursor(n,p){} /** * This method will block until 'after_pos' is the * current pos, then it will set pos to 'pos' */ void publish_after( int64_t pos, int64_t after_pos ) { try { assert( pos > after_pos ); while( _cursor.aquire() < after_pos ) { // TODO:... this is a spinlock, ease CPU HERE... } // _barrier.wait_for(after_pos); publish( pos ); } catch ( const eof& ) { _cursor.set_eof(); throw; } catch ( ... ) { set_alert( std::current_exception() ); throw; } } bool is_available( int64_t pos ) { return pos <= _barrier.get_min(); } int64_t claim(int64_t num) { auto pos = _claim_cursor.atomic_increment_and_get( num ); return pos - num; } sequence _claim_cursor; }; typedef std::shared_ptr read_cursor_ptr; /** * Tracks the write position in a buffer. * * Write cursors need to know the size of the buffer * in order to know how much space is available. */ class write_cursor : public event_cursor { public: /** @param s - the size of the ringbuffer, * required to do proper wrap detection **/ write_cursor(int64_t s) :_size(s),_size_m1(s-1) { _begin = 0; _end = _size; _cursor.store(-1); } /** * @param n - name of the cursor for debug purposes * @param s - the size of the buffer. 
*/ write_cursor(const char* n, int64_t s) :event_cursor(n),_size(s),_size_m1(s-1) { _begin = 0; _end = _size; _cursor.store(-1); } /** waits for begin() to be valid and then * returns it. This is only safe for * single producers, multi-producers should * use claim(1) instead. */ int64_t wait_next() { wait_for( _begin ); return _begin; } /** * We need to wait until the available space in * the ring buffer is pos - cursor which means that * all readers must be at least to pos - _size and * that our new end is the min of the readers + _size */ int64_t wait_for( int64_t pos ) { try { // throws exception on error, returns 'short' on eof return _end = _barrier.wait_for( pos - _size ) + _size; } catch ( ... ) { set_alert( std::current_exception() ); throw; } } int64_t check_end() { return _end = _barrier.get_min() + _size; } private: const int64_t _size; const int64_t _size_m1; }; typedef std::shared_ptr write_cursor_ptr; /** * When there are multiple writers this cursor can * be used to reserve space in the write buffer * in an atomic manner. * * @code * auto start = cur->claim(slots); * ... do your writes... * cur->publish_after( start + slots, start -1 ); * @endcode * * @todo * An alternative implementation of this would involve * having a sequence number for each thread. A pre-allocated * array of sequence pointers would be initialized to null. * There would be a 'thread-specific' index into this array * that would be allocated by an atomic inc the first time * a new thread attempts to write. Each sequence number * would maintain two sequence numbers: published and * pending. * * To determine the actual 'position' of the write * cursor one would return the MIN( pending ) -1 or * if no sequences are in the 'pending state' the * MAX(published). The pending state is any time * the pending > published. * * The consequence of this approach is that readers * would have to perform more work to determine the end * (reading from all thread positions), the benefit is * that the producers would never have to 'wait' on * each other. * * A variation on this would be to have a fixed * set of producers instead of a dynamic set. This * fixed set would be configured at the start. * * If there is low write-contention then this approach * would probably be poor. */ class shared_write_cursor : public write_cursor { public: /** @param s - the size of the ringbuffer, * required to do proper wrap detection **/ shared_write_cursor(int64_t s) :write_cursor(s){} /** * @param n - name of the cursor for debug purposes * @param s - the size of the buffer. */ shared_write_cursor(const char* n, int64_t s) :write_cursor(n,s){} /** When there are multiple writers they cannot both * assume the right to write to begin() to end(), * instead they must first claim some slots in an * atomic manner. * * * After pos().aquire() == claim( slots ) -1 the claimer * is free to call publish up to start + slots -1 * * @return the first slot the caller may write to. 
*/ int64_t claim( size_t num_slots ) { auto pos = _claim_cursor.atomic_increment_and_get( num_slots ); // std::cerr<<" shared_write: publish "< after_pos ); // std::cerr<<"publish "< shared_write_cursor_ptr; inline void barrier::follows( const event_cursor& e ) { _limit_seq.push_back( &e ); } inline int64_t barrier::get_min() { int64_t min_pos = 0x7fffffffffffffff; for( auto itr = _limit_seq.begin(); itr != _limit_seq.end(); ++itr ) { auto itr_pos = (*itr)->pos().aquire(); if( itr_pos < min_pos ) min_pos = itr_pos; } return _last_min = min_pos; } inline int64_t barrier::wait_for( int64_t pos )const { if( _last_min > pos ) return _last_min; int64_t min_pos = 0x7fffffffffffffff; for( auto itr = _limit_seq.begin(); itr != _limit_seq.end(); ++itr ) { int64_t itr_pos = 0; itr_pos = (*itr)->pos().aquire(); // spin for a bit for( int i = 0; itr_pos < pos && i < 10000; ++i ) { itr_pos = (*itr)->pos().aquire(); if( (*itr)->pos().alert() ) break; } // yield for a while, queue slowing down for( int y = 0; itr_pos < pos && y < 10000; ++y ) { usleep(0); itr_pos = (*itr)->pos().aquire(); if( (*itr)->pos().alert() ) break; } // queue stalled, don't peg the CPU but don't wait // too long either... while( itr_pos < pos ) { usleep( 10*1000 ); itr_pos = (*itr)->pos().aquire(); if( (*itr)->pos().alert() ) break; } if( (*itr)->pos().alert() ) { (*itr)->check_alert(); if( itr_pos > pos ) return itr_pos -1; // process everything up to itr_pos throw eof(); } if( itr_pos < min_pos ) min_pos = itr_pos; } //assert( min_pos != 0x7fffffffffffffff ); return _last_min = min_pos; } inline void event_cursor::check_alert()const { if( _alert != std::exception_ptr() ) std::rethrow_exception( _alert ); } } // namespace disruptor ================================================ FILE: fast_rand.cpp ================================================ #include #include #include #include #include #include #include #ifdef _MSC_VER #pragma intrinsic(__rdtsc) uint64_t get_cc_time () { return __rdtsc(); } #else /* define this somewhere */ #ifdef __i386 __inline__ uint64_t rdtsc() { uint64_t x; __asm__ volatile ("rdtsc" : "=A" (x)); return x; } #elif __amd64 __inline__ uint64_t rdtsc() { uint64_t a, d; __asm__ volatile ("rdtsc" : "=a" (a), "=d" (d)); return a; //(d<<32) | a; } #endif uint64_t get_cc_time () { return rdtsc(); } #endif // Some primes between 2^63 and 2^64 for various uses. // source: CityHash static const uint64_t k0 = 0xc3a5c85c97cb3127ULL; static const uint64_t k1 = 0xb492b66fbe98f273ULL; static const uint64_t k2 = 0x9ae16a3b2f90404fULL; inline uint64_t ShiftMix(uint64_t val) { return val ^ (val >> 47); } uint64_t fast_rand() { int64_t now = rdtsc(); //get_cc_time(); char* s = (char*)&now; // note first 4 bits are 'LSB' on intel... // on bigendian machine we want to add 4 // LSB is most rand, the higher-order bits // will not change much if at all between // calls... 
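// the three bytes sampled below are s[0], s[2] (= s[4>>1]) and s[3] (= s[4-1]):
// the fast-changing low-order bytes of the counter on little-endian x86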
const uint8_t a = s[0]; const uint8_t b = s[4 >> 1]; const uint8_t c = s[4 - 1]; const uint32_t y = static_cast(a) + (static_cast(b) << 8); const uint32_t z = 4 + (static_cast(c) << 2); return ShiftMix(y * k2 ^ z * k0) * k2; } ================================================ FILE: fc_heap.hpp ================================================ #pragma once #include "mmap_alloc.hpp" #include #include #include #include #include #include #define CHECK_SIZE( x ) assert(((x) != 0) && !((x) & ((x) - 1))) #define PAGE_SIZE (2*1024*1024) #define LOG2(X) ((unsigned) (8*sizeof (unsigned long long) - __builtin_clzll((X)) - 1)) #define LZERO(X) (__builtin_clzll((X)) ) #define NUM_BINS 32 // log2(PAGE_SIZE) class block_header { public: block_header() :_prev_size(0),_size(-PAGE_SIZE),_flags(0) { //fprintf( stderr, "constructor... size: %d\n", _size ); //memset( data(), 0, size() - 8 ); assert( page_size() == PAGE_SIZE ); } void* operator new (size_t s) { return malloc(PAGE_SIZE);/*mmap_alloc( PAGE_SIZE );*/ } void operator delete( void* p ) { free(p); /*mmap_free( p, PAGE_SIZE );*/ } void dump( const char* label ) { fprintf( stderr, "%s ] _prev_size: %d _size: %d\n", label, _prev_size, _size);//, int(_flags) ); } /** size of the block header including the header, data size is size()-8 */ uint32_t size()const { return abs(_size); } char* data() { return reinterpret_cast(((char*)this)+8); } block_header* next()const { return _size <= 0 ? nullptr : reinterpret_cast(((char*)this)+size()); } block_header* prev()const { return _prev_size <= 0 ? nullptr : reinterpret_cast(((char*)this)-_prev_size); } /** * creates a new block of size S at the end of this block. * * @pre size is a power of 2 * @return a pointer to the new block, or null if no split was possible */ block_header* split( uint32_t sz ) { assert( sz >= 32 ); assert( size() >= 32 ); assert( sz <= (size() - 32) ); assert( page_size() == PAGE_SIZE ); assert( _size != 0xbad ); CHECK_SIZE(sz); int32_t old_size = _size; block_header* old_nxt = next(); _size = size() - sz; assert( _size != 0 ); block_header* nxt = next(); assert( nxt != 0 ); nxt->_prev_size = _size; nxt->_size = old_size < 0 ? -sz : sz; assert( _size != 0 ); if( old_nxt ) old_nxt->_prev_size = nxt->_size; //memset( data(), 0, size()-8 ); assert( size() + nxt->size() == uint32_t(abs(old_size)) ); assert( nxt->next() == old_nxt ); assert( nxt->prev() == this ); assert( next() == nxt ); assert( page_size() == PAGE_SIZE ); assert( nxt->page_size() == PAGE_SIZE ); assert( nxt != this ); nxt->_flags = 0; return nxt; } /** * @return the merged node, if any */ block_header* merge_next() { assert( _size != 0xbad ); block_header* cur_next = next(); if( !cur_next ) return this; assert( cur_next->_size != 0xbad ); assert( cur_next->size() > 0 ); // if( !cur_next->is_idle() ) return this; auto s = size(); assert( _size > 0 ); _size += cur_next->size(); assert( _size != 0 ); if( cur_next->_size > 0 ) { block_header* new_next = next(); new_next->_prev_size = size(); } else { _size = -_size; // we are at the end. assert( _size != 0 ); } assert( cur_next->_size = 0xbad ); // memset( data(), 0, size()-8 ); assert( size() > s ); if( next() ) { assert( size()/8 == next() - this ); assert( next()->_prev_size == size() ); assert( page_size() == PAGE_SIZE ); } return this; } /** * @return the merged node, or this. 
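 * (returns this when there is no previous block; otherwise delegates to prev()->merge_next())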
*/ block_header* merge_prev() { assert( page_size() == PAGE_SIZE ); block_header* pre = prev(); if( !pre ) return this; return prev()->merge_next(); } block_header* head() { if( !prev() ) return this; return prev()->head(); } block_header* tail() { if( !next() ) return this; return next()->tail(); } size_t page_size() { auto t = tail(); auto h = head(); return ((char*)t-(char*)h) + t->size(); } struct queue_state // the block is serving as a linked-list node { block_header* qnext; block_header* qprev; block_header** head; block_header** tail; }; enum flag_enum { queued = 1, idle = 2, active = 4 }; bool is_idle()const { return _flags & idle; } bool is_active()const { return _flags & active; } bool is_queued()const { return _flags & queued; } void set_active( bool s ) { if( s ) _flags |= active; else _flags &= ~active; } void set_queued( bool s ) { if( s ) _flags |= queued; else _flags &= ~queued; // anytime we change state it should be reset.. if( is_queued() ) { as_queue().qnext = nullptr; as_queue().qprev = nullptr; } } /** removes this node from any queue it is in */ void dequeue() { block_header* pre = as_queue().qprev; block_header* nxt = as_queue().qnext; if( pre ) pre->as_queue().qnext = nxt; if( nxt ) nxt->as_queue().qprev = pre; set_queued(false); } void set_idle( bool s ) { if( s ) _flags |= idle; else _flags &= ~idle; assert( is_idle() == s ); } queue_state& as_queue() { // assert( is_queued() ); return *reinterpret_cast(data()); } // private: int32_t _prev_size; // size of previous header. int32_t _size:24; // offset to next, negitive indicates tail, 8 MB max, it could be neg int32_t _flags:8; // offset to next, negitive indicates tail }; static_assert( sizeof(block_header) == 8, "Compiler is not packing data" ); typedef block_header* block_header_ptr; struct block_stack { public: block_stack():_head(nullptr){} void push( block_header* h ) { h->as_queue().qnext = _head; if( _head ) _head->as_queue().qprev = h; _head = h; //_head.push_back(h); } void push_all( block_header* h ) { assert( h->is_queued() ); assert( _head == nullptr ); _head = h; } /* bool pop( block_header* h ) { if( _head == nullptr ) return null; return _head.erase(h) != 0; } */ /** returns all blocks */ block_header* pop_all() { block_header* h = _head; _head = nullptr; return h; } block_header* pop() { if( _head ) { auto tmp = _head; _head = _head->as_queue().qnext; if( _head ) _head->as_queue().qprev = nullptr; return tmp; } return nullptr; /* if( _head.size() == 0 ) return nullptr; auto f = _head.begin(); auto h = *f; _head.erase(f); return h; */ } block_header* head(){ return _head; } //int size() { return int(_head.size()); } private: //std::unordered_set _head; block_header* _head; }; /** * Single threaded heap implementation, foundation * for multi-threaded version; */ class fc_heap { public: block_header* alloc( size_t s ); void free( block_header* h ); fc_heap() { memset(_bins, 0, sizeof(_bins) ); _free_32_data = mmap_alloc( PAGE_SIZE ); _free_64_data = mmap_alloc( PAGE_SIZE ); _free_32_data_end = _free_32_data + PAGE_SIZE; _free_64_data_end = _free_64_data + PAGE_SIZE; _free_32_scan_end = &_free_32_state[PAGE_SIZE/32/64]; _free_64_scan_end = &_free_64_state[PAGE_SIZE/64/64]; _free_32_scan_pos = _free_32_state; _free_64_scan_pos = _free_64_state; memset( _free_32_state, 0xff, sizeof(_free_32_state ) ); memset( _free_64_state, 0xff, sizeof(_free_64_state ) ); } ~fc_heap() { mmap_free( _free_64_data, PAGE_SIZE ); mmap_free( _free_32_data, PAGE_SIZE ); } // private: char* alloc32() { uint32_t c = 0; while( 
0 == *_free_32_scan_pos ) { ++_free_32_scan_pos; if( _free_32_scan_pos == _free_32_scan_end ) { _free_32_scan_pos = _free_32_state; } if( ++c == sizeof(_free_32_state)/sizeof(int64_t) ) { return alloc64(); } } int bit = LZERO(*_free_32_scan_pos); int offset = (_free_32_scan_pos - _free_32_state)*64; *_free_32_scan_pos ^= (1ll<<(63-bit)); // flip the bit // fprintf( stderr, "alloc offset: %d bit %d pos %d\n", offset,bit,(offset+bit) ); return _free_32_data + (offset+bit)*32; } char* alloc64() { uint32_t c = 0; while( 0 == *_free_64_scan_pos ) { ++_free_64_scan_pos; if( _free_64_scan_pos == _free_64_scan_end ) { _free_64_scan_pos = _free_64_state; } if( ++c == sizeof(_free_64_state)/sizeof(int64_t) ) { return nullptr; } } int bit = LZERO(*_free_64_scan_pos); int offset = (_free_64_scan_pos - _free_64_state)*64; *_free_64_scan_pos ^= (1ll<<(63-bit)); // flip the bit return _free_64_data + (offset+bit)*64; } bool free32( char* p ) { if( p >= _free_32_data && _free_32_data_end > p ) { uint32_t offset = (p - _free_32_data)/32; uint32_t bit = offset & (64-1); uint32_t idx = offset/64; _free_32_state[idx] ^= (1ll<<((63-bit))); return true; } return false; } bool free64( char* p ) { if( p >= _free_64_data && _free_64_data_end > p ) { uint32_t offset = (p - _free_64_data)/64; uint32_t bit = offset & (64-1); uint32_t idx = offset/64; _free_64_state[idx] ^= (1ll<<((63-bit))); return true; } return false; } char* _free_32_data; char* _free_64_data; char* _free_32_data_end; char* _free_64_data_end; uint64_t* _free_32_scan_pos; uint64_t* _free_64_scan_pos; uint64_t* _free_32_scan_end; uint64_t* _free_64_scan_end; uint64_t _free_32_state[PAGE_SIZE/32/64]; uint64_t _free_64_state[PAGE_SIZE/64/64]; block_stack _bins[NUM_BINS]; // anything less than 1024 bytes }; /** * Return a block of size s or greater * @pre size >= 32 * @pre size is power of 2 */ block_header* fc_heap::alloc( size_t s ) { assert( s >= 32 ); CHECK_SIZE( s ); // make sure it is a power of 2 uint32_t min_bin = LOG2(s); // find the min bin for it. 
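// Scan bins from min_bin upward: pop a queued block if one exists, split off the
// requested size, return the remainder to the free bins, and hand back the tail.
// If no bin has a suitable block, fall through and carve up a fresh PAGE_SIZE block below.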
while( min_bin < 32 ) { block_header* h = _bins[min_bin].pop(); if( h ) { assert( h->_size != 0 ); assert( h->_size != 0xbad ); assert( h->is_queued() ); h->set_queued(false); if( h->size() - 32 < s ) { h->set_active(true); return h; } block_header* tail = h->split(s); assert( h->_size != 0 ); h->set_active(true); this->free(h); tail->set_active(true); return tail; } ++min_bin; } // mmap a new page block_header* h = new block_header(); block_header* t = h->split(s); h->set_active(true); free(h); t->set_active(true); return t; } void fc_heap::free( block_header* h ) { assert( h != nullptr ); assert( h->is_active() ); assert( h->_size != 0 ); assert( h->size() < PAGE_SIZE ); auto pre = h->prev(); auto nxt = h->next(); if( nxt && !nxt->is_active() && nxt->is_queued() ) { auto nxt_bin = LOG2(nxt->size()); if( _bins[nxt_bin].head() == nxt ) { _bins[nxt_bin].pop(); nxt->set_queued(false); } else { nxt->dequeue(); } h = h->merge_next(); } if( pre && !pre->is_active() && pre->is_queued() ) { auto pre_bin = LOG2(pre->size()); if( _bins[pre_bin].head() == pre ) { _bins[pre_bin].pop(); pre->set_queued(false); } else { pre->dequeue(); } h = pre->merge_next(); } if( h->size() == PAGE_SIZE ) { delete h; return; } h->set_active(false); h->set_queued(true ); auto hbin = LOG2(h->size()); _bins[hbin].push(h); } class thread_heap; class garbage_thread { public: static garbage_thread& get(); uint64_t avail( int bin ); int64_t claim( int bin, int64_t num ); block_header* get_claim( int bin, int64_t pos ); protected: void register_thread_heap( thread_heap* h ); friend class thread_heap; static void run(); }; class thread_heap { public: static thread_heap& get(); block_header* allocate( size_t s ) { if( s >= PAGE_SIZE ) { // TODO: allocate special mmap region... } uint32_t min_bin = LOG2(s); // find the min bin for it. while( min_bin < NUM_BINS ) { block_header* h = cache_alloc(min_bin, s); if( h ) return h; garbage_thread& gc = garbage_thread::get(); if( auto av = gc.avail( min_bin ) ) { int64_t claim_num = std::min(4,av); int64_t claim = gc.claim( min_bin, claim_num ); int64_t end = claim + claim_num; while( claim < end ) { block_header* h = gc.get_claim(min_bin,claim); if( h ) { cache(h); } ++claim; } h = cache_alloc(min_bin, s); if( h ) return h; // else... 
we actually didn't get our claim } ++min_bin; } block_header* h = new block_header(); h->set_active(true); if( s <= PAGE_SIZE - 32 ) { block_header* t = h->split(s); t->set_active(true); cache( h ); return t; } return h; } block_header* cache_alloc( int bin, size_t s ) { block_header* c = pop_cache(bin); if( c && (c->size() - 32) > s ) { block_header* t = c->split(s); c->set_active(true); if( !cache( c ) ) { this->free(c); } t->set_active(true); return t; } return nullptr; } bool cache( block_header* h ) { uint32_t b = LOG2( h->size() ); if( _cache_size[b] < 4 ) { h->set_queued(true); _cache[b].push(h); _cache_size[b]++; return true; } return false; } block_header* pop_cache( int bin ) { block_header* h = _cache[bin].pop(); if( h ) { _cache_size[bin]--; h->set_queued(false); return h; } return nullptr; } void free( block_header* h ) { h->set_queued(true); _gc_on_deck.push( h ); if( !_gc_at_bat.head() ) _gc_at_bat.push_all( _gc_on_deck.pop_all() ); } private: thread_heap(); friend garbage_thread; block_stack _gc_at_bat; // waiting for gc to empty block_stack _gc_on_deck; // caching until gc pickups at bat block_stack _cache[NUM_BINS]; int16_t _cache_size[NUM_BINS]; }; static fc_heap static_heap; void* fc_malloc( size_t s ) { if( s <= 64 ) { if( s <= 32 ) return static_heap.alloc32(); else return static_heap.alloc64(); } // round up to nearest power of 2 > 32 s += 8; // room for header. if( s < 32 ) s = 32; // min size s = (1<<(LOG2(s-1)+1)); // round up to nearest power of 2 if( s < 24 ) s = 24; block_header* h = static_heap.alloc( s ); assert( h->is_active() ); // h->set_idle(false); // assert( h->page_size() == PAGE_SIZE ); return h->data(); } void fc_free( void* f ) { if( static_heap.free32((char*)f) || static_heap.free64((char*)f) ) return; block_header* bh = (block_header*)(((char*)f)-8); // fprintf( stderr, "fc_free(block: %p)\n", bh ); // assert( bh->is_active() ); //assert( bh->page_size() == PAGE_SIZE ); static_heap.free(bh); } ================================================ FILE: fc_malloc.cpp ================================================ /* pool<24> p24; pool<58> p58; pool<120> p120; pool<248> p248; pool<504> p504; pool<1016> p1016; pool<2040> p2040; pool<4088> p4088; */ void* fc_malloc( size_t s ) { #define TRY_POOL(I,X,S) if( len < X ) return pool::alloc(); TRY_POOL(1,24,256); TRY_POOL(2,58,256); TRY_POOL(3,120,256); TRY_POOL(4,248,128); TRY_POOL(5,504,128); TRY_POOL(6,1016,128); TRY_POOL(7,2040,64); TRY_POOL(8,4088,64); TRY_POOL(9,8184,64); if( len < 64*1024 ) { } if( len < 1024*1024 ) { } else { uint64_t* m = malloc( s+8); *m = -1; return m+1; } } free( void* f ) { } ================================================ FILE: fc_malloc.h ================================================ void* fc_malloc( size_t s ); free( void* f ); ================================================ FILE: fixed_pool.hpp ================================================ #include #include #include "mmap_alloc.hpp" #include "bit_index.hpp" #define GB (1024LL*1014LL*1024LL) #define MB (1024LL*1024LL) #define LOG2(X) ((unsigned) (8*sizeof (unsigned long long) - __builtin_clzll((X)) - 1)) class basic_page { public: basic_page():_next_page(nullptr){} virtual ~basic_page(){} virtual void release() = 0; virtual void* alloc() = 0; virtual void free( void* ) = 0; virtual int get_page_pos() = 0; virtual int get_pool() = 0; virtual int64_t get_available()const = 0; basic_page* _next_page; // virtual void item_size()const = 0; }; typedef basic_page* basic_page_ptr; class basic_pool { public: virtual 
~basic_pool(){} virtual basic_page* claim_page() = 0; virtual bool gc_free(void*) = 0; virtual void gc_release( basic_page_ptr p ) = 0; }; typedef basic_pool* basic_pool_ptr; struct free_node { free_node* next; }; template class fixed_pool : public basic_pool { public: class page : public basic_page { public: page( int64_t claim_pos ) { fprintf( stderr, "CLAIM POS %lld\n", claim_pos ); _data = (char*)mmap_alloc( PageSize, (void*)((ItemSize << 32) + claim_pos * PageSize) ); fprintf( stderr, " PAGE DATA: %p\n", _data ); assert( (int64_t(_data) >> 32) == ItemSize ); _next_data = _data; _page_end = _data + PageSize; _alloc_free = nullptr; _gc_free_at_bat = nullptr; _gc_free_on_deck = nullptr; _claim_pos = claim_pos; _alloc = 0; _free = 0; } int _claim_pos; virtual int get_page_pos() { return _claim_pos; } int get_pool() { return LOG2(ItemSize)-4; } ~page() { mmap_free( _data, PageSize ); } void* alloc() { if( _gc_free_at_bat ) { fprintf( stderr, "%p _gc_free_at_bat page pos %d\n", this, _claim_pos ); free_node* gc = _gc_free_at_bat; _gc_free_at_bat = nullptr; while( gc ) { free_node* n = gc->next; gc->next = _alloc_free; _alloc_free = gc; gc = n; } } if( _alloc_free ) { free_node* n = _alloc_free; _alloc_free = n->next; ++_alloc; return n; } else if( _next_data != _page_end ) { char* n = _next_data; _next_data += ItemSize; assert( n < _page_end ); ++_alloc; return n; } else { fprintf( stderr, "_next_data == _page_end\n" ); return nullptr; } } int64_t get_available()const { return PageSize/ItemSize - _alloc + _free; //_avail; } void free( void* c ) { assert( c > _data && c < _page_end ); free_node* n = (free_node*)c; n->next = _alloc_free; _alloc_free = n; } void gc_free( void* c ) { //fprintf( stderr, "gc_free(%p) _data %p _end %p\n", c, _data, _page_end ); assert( c >= _data && c < _page_end ); free_node* n = (free_node*)c; n->next = _gc_free_on_deck; _gc_free_on_deck = n; if( !_gc_free_at_bat ) { _gc_free_at_bat = _gc_free_on_deck; _gc_free_on_deck = nullptr; } ++_free; } bool is_claimed()const { return 0 != _claim.load(std::memory_order_relaxed); } bool claim() { return 0 == _claim.fetch_add(1); } void release() { _claim.store(0); } protected: friend class thread_local_heap; friend class fixed_pool; int64_t _alloc; // count managed by alloc thread int64_t _free; // count managed by the gc thread std::atomic _claim; // when 0 no one owns this page, first person to inc owns the page. free_node* _alloc_free; // free list managed by alloc thread free_node* _gc_free_at_bat; free_node* _gc_free_on_deck; char* _data; char* _page_end; char* _next_data; }; // class page /** * Grab the next page with free space or allocate on * if necessary. This method may be called from any * thread. 
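 *
 * In outline: first try to pop a partially-free page from the _pending_pages ring
 * that the gc thread refills in gc_release(); if none is available, atomically
 * claim a new page index and mmap a fresh page.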
*/ virtual basic_page* claim_page() { auto rp = _pending_read_pos.load( std::memory_order_relaxed ); auto wp = _pending_write_pos.load( std::memory_order_relaxed ); if( rp <= wp ) { int64_t claim = _pending_read_pos.fetch_add(1); if( claim <= wp ) { basic_page* p = _pending_pages[claim%32]; _pending_pages[claim%32] = 0; if( p ) { fprintf( stderr, "claiming pending page %p \n", p);//, p->get_page_pos() ); return p; } else { fprintf( stderr, "pending pages[claim] == null\n" ); } } } int64_t claim = _next_page.fetch_add(1); page* p = new page(claim); fprintf( stderr, "alloc new page pending page %p %d\n", p, p->get_page_pos() ); //p->claim(); _pages[claim] = p; return p; } virtual bool gc_free( void* v ) { int64_t byte_pos = (int64_t(v)<<32)>>32; int64_t page_num = byte_pos/(PageSize); auto pg = _pages[page_num]; fprintf( stderr, "page_num %lld %p\n", page_num, v ); assert( pg ); if( pg ) { pg->gc_free(v); return true; } return false; } virtual void gc_release( basic_page_ptr p ) { _free_pages.set( p->get_page_pos() ); auto rp = _pending_read_pos.load(std::memory_order_relaxed); auto wp = _pending_write_pos.load(std::memory_order_relaxed); while( rp > wp - 31 ) { ++wp; auto pos = wp%32; if( _pending_pages[pos] == nullptr ) { int b = _free_pages.first_set_bit(); if( _pages[b] && _pages[b]->get_available() ) { _free_pages.clear(b); fprintf( stderr, "pending_pages[%lld] = %p\n", pos, _pages[b] ); _pending_pages[ pos ] = _pages[b]; } if( !_pages[b] ){ --wp; break; } } } _pending_write_pos.store(wp); } fixed_pool() :_pending_read_pos(0),_pending_write_pos(-1) { _free_pages.set_all(); memset( _pages, 0, sizeof(_pages) ); memset( _pending_pages, 0, sizeof(_pending_pages) ); } typedef page* page_ptr; std::atomic _next_page; // inc to allocate a new page. std::atomic _pending_read_pos; std::atomic _pending_write_pos; page_ptr _pending_pages[32]; // updated by gc thread... 'unclaimed pages' with free data. 
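// bitmap of page slots released back to the pool; gc_release() scans it with first_set_bit() to refill _pending_pages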
bit_index<64*64/*2*GB/PageSize*/> _free_pages; page_ptr _pages[2*GB/PageSize]; }; class thread_local_heap; class garbage_collector { public: garbage_collector() :_done(false), _tlheaps(nullptr), _gc_thread(&garbage_collector::run){} ~garbage_collector() { _done.store(true); _gc_thread.join(); } void register_thread_local_heap( thread_local_heap* t ); static garbage_collector& get() { static garbage_collector gc; return gc; } static void run(); private: std::atomic _done; std::atomic _tlheaps; std::thread _gc_thread; }; static basic_pool_ptr get_pool( int p ) { if( !(p >= 0 && p < 16 ) ) fprintf( stderr, "%d", p ); assert( (p >= 0 && p < 16 ) ); static basic_pool_ptr _pools[16]; static bool _init = [&]()->bool{ // allocate the pools for all size classes _pools[0] = new fixed_pool<16>(); _pools[1] = new fixed_pool<32>(); _pools[2] = new fixed_pool<64>(); _pools[3] = new fixed_pool<128>(); _pools[4] = new fixed_pool<256>(); _pools[5] = new fixed_pool<512>(); _pools[6] = new fixed_pool<1024>(); _pools[7] = new fixed_pool<2*1024>(); _pools[8] = new fixed_pool<4*1024>(); _pools[9] = new fixed_pool<8*1024>(); _pools[10] = new fixed_pool<16*1024>(); _pools[11] = new fixed_pool<32*1024>(); _pools[12] = new fixed_pool<64*1024>(); _pools[13] = new fixed_pool<128*1024>(); _pools[14] = new fixed_pool<256*1024>(); _pools[15] = new fixed_pool<512*1024>(); return true; }(); (void)_init; // unused warning return _pools[p]; } class thread_local_heap { public: thread_local_heap() :_gc_at_bat(nullptr), _release_at_bat(nullptr), _gc_on_deck(nullptr), _release_on_deck(nullptr) { garbage_collector::get().register_thread_local_heap(this); } ~thread_local_heap() { } static thread_local_heap& get() { static __thread thread_local_heap* tlh = nullptr; if( !tlh ) tlh = new thread_local_heap(); return *tlh; } void* alloc( size_t s ) { int32_t pool = LOG2(s-1) + 1 - 4; // fprintf( stderr, "pool %d for size %d\n", pool, int(s) ); if( !_pages[pool] ) { basic_page_ptr p = get_pool(pool)->claim_page(); fprintf( stderr, "claim pool! %p\n", p ); assert(p); _pages[pool] = p; auto r = p->alloc(); assert(r); return r; } void* a = _pages[pool]->alloc(); if( !a ) // the page must be full... release it and get a new one { fprintf( stderr, "release pool %d %p\n", pool, _pages[pool] ); basic_page_ptr p = get_pool(pool)->claim_page(); assert( p ); fprintf( stderr, "new page %p avail: %lld\n", p, p->get_available() ); _pages[pool]->_next_page = _release_on_deck; _release_on_deck = _pages[pool]; if( _release_at_bat == nullptr ) { _release_at_bat = _release_on_deck; _release_on_deck = nullptr; } _pages[pool] = p; assert(p); auto r = p->alloc(); assert(r); return r; } assert( a ); return a; } void free( void* v ) { assert( v != nullptr ); // fprintf( stderr, "free %p tld: %p\n", v, this ); // size_t s = int64_t(v)>>32; // int32_t pool = LOG2(s) - 4; // fprintf( stderr, "Free size: %llu on pool %d\n", s, pool ); // try local free first. // if( _pages[pool] && _pages[pool]->free(v) ) // return; free_node* fv = (free_node*)v; assert( fv != _gc_on_deck ); fv->next = _gc_on_deck; _gc_on_deck = fv; if( _gc_at_bat == nullptr ) { _gc_at_bat = _gc_on_deck; _gc_on_deck = nullptr; } } private: friend class garbage_collector; free_node* _gc_at_bat; basic_page_ptr _release_at_bat; uint64_t _gc_pad[7]; free_node* _gc_on_deck; basic_page_ptr _release_on_deck; // current page for this thread... 
basic_page_ptr _pages[32]; // sized every power of 2 up to 1MB thread_local_heap* _next; }; void garbage_collector::register_thread_local_heap( thread_local_heap* t ) { auto* stale_head = _tlheaps.load(std::memory_order_relaxed); do { t->_next = stale_head; }while( !_tlheaps.compare_exchange_weak( stale_head, t, std::memory_order_release ) ); } void garbage_collector::run() { garbage_collector& gc = garbage_collector::get(); while( true ) { bool found_work = false; thread_local_heap* cur = gc._tlheaps.load( std::memory_order_relaxed ); while( cur ) { free_node* n = cur->_gc_at_bat; if( n ) { cur->_gc_at_bat = nullptr; found_work = true; } while( n ) { auto next = n->next; // TODO: free N int pool = LOG2( int64_t(n) >> 32 ) - 4; // fprintf( stderr, "pool %d gc_free %p\n", pool, n ); get_pool( pool )->gc_free(n); //fprintf( stderr, "." ); assert( n != next ); n = next; } if( cur->_release_at_bat != nullptr ) { basic_page_ptr p = cur->_release_at_bat; cur->_release_at_bat = nullptr; while( p ) { p->release(); int pool = p->get_pool(); //LOG2( int64_t(p) >> 32 ) - 4; get_pool( pool )->gc_release(p); p = p->_next_page; } } assert( cur != cur->_next ); cur = cur->_next; } if( !found_work ) { // TODO: replace with something better.. ::usleep( 100 ); if( gc._done.load() ) return; } } } void* fp_malloc( size_t s ) { return thread_local_heap::get().alloc(s); } void fp_free( void* v ) { thread_local_heap::get().free(v); } ================================================ FILE: garbage_collector.hpp ================================================ ================================================ FILE: hheap.cpp ================================================ #include #include #include #include #include #include #include #include #include #include std::mutex print_mutex; #include "disruptor.hpp" using namespace disruptor; #if 0 #define PRINT( ... ) \ { std::unique_lock _lock(print_mutex); \ __VA_ARGS__ \ } #define NEW_PRINT( ... ) \ { std::unique_lock _lock(print_mutex); \ __VA_ARGS__ \ } #define PAGE_FREE_PRINT( ... ) \ { std::unique_lock _lock(print_mutex); \ __VA_ARGS__ \ } #else #define PRINT(...) #define NEW_PRINT(...) #define PAGE_FREE_PRINT(...) #endif int64_t fast_rand(); struct slot_header { int32_t page_id; // used by free to find the page in the pool int16_t pool_id; // used by free to find the pool uint8_t page_slot; // the slot in the page in the pool uint8_t alignment; // 8 if reserved, 0 if free... byte _data[alignment-1] = alignment. }; template struct page { public: struct slot { int32_t page_id; // used by free to find the page in the pool int16_t pool_id; // used by free to find the pool uint8_t page_slot; // the slot in the page in the pool uint8_t alignment; // 8 if reserved, 0 if free... byte _data[alignment-1] = alignment. char _data[Size]; // alignment helps us find the page_id/pool_id when allocated aligned objects. }; page(int16_t page_id, int16_t pool_id) :_free_write_cursor(NumSlots) { _pool_id = pool_id; _page_id = page_id; _posted = false; // ... 
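// --- Hedged aside (not part of this file): register_thread_local_heap() above pushes the new
// heap onto an intrusive, lock-free singly linked list with a CAS loop; the gc thread only ever
// walks the list, so nodes never need to be unlinked.  Distilled version (hypothetical node type):
#include <atomic>

struct node_sketch { node_sketch* next = nullptr; };

inline void push_front( std::atomic<node_sketch*>& head, node_sketch* n )
{
    node_sketch* stale = head.load( std::memory_order_relaxed );
    do {
        n->next = stale;                            // link to the current head
    } while( !head.compare_exchange_weak( stale, n, // publish n as the new head
                                          std::memory_order_release,
                                          std::memory_order_relaxed ) );
}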
_free_write_cursor.follows( _free_read_cursor ); _free_read_cursor.follows( _free_write_cursor ); for( int i = 0; i < NumSlots; ++i ) { slot& s = _slot[i]; s.page_id = page_id; s.pool_id = pool_id; s.page_slot = i; s.alignment = 8; // free expects this this->free(i); // increment the free write cursor } _release_free_pos = 0; assert( free_estimate() == NumSlots ); assert( can_alloc() ); } int32_t free_estimate() { if( _release_free_pos < 0 ) return 0; return _free_write_cursor.begin() - _release_free_pos; } bool can_alloc() { if( _free_read_cursor.begin() == _free_read_cursor.end() && _free_read_cursor.begin() == _free_read_cursor.check_end() ) { // std::cerr<<" CAN ALLOC? page: "<<_page_id<<" free read cursor begin: "<<_free_read_cursor.begin()<<" end: "<<_free_read_cursor.end()<<"\n"; return false; } return true; } char* alloc(uint8_t align = 8) { if( !can_alloc() ) return nullptr; auto pos = _free_read_cursor.begin(); int64_t free_slot = _free_list.at(pos); _free_read_cursor.publish( pos ); // std::cerr<<"page: "<<_page_id<<" alloc slot: "<= 8 ); assert( _slot[slot].pool_id == _pool_id ); _slot[slot].alignment = 0; // last thing we do is set alignment. auto cl = _free_write_cursor.claim(1); _free_list.at(cl) = slot; //_free_write_cursor.publish_after( cl, cl - 1 ); _free_write_cursor.publish( cl );//, cl - 1 ); return free_estimate(); return 0; } /** called to save the free cursor position so we can track how many * slots have been freed since this thread gave up control */ void release() { _posted = false; _free_claim.store(0,std::memory_order_relaxed); _release_free_pos = _free_write_cursor.begin(); } void claim() { _release_free_pos = -1; } bool claim_free() { if( !_posted && 0 == _free_claim.fetch_add(1, std::memory_order_release ) ) { return _posted = true; } return false; } bool is_posted_to_free_list(){ return _posted; } private: slot _slot[NumSlots]; // actual data storage /** the position of the free_write_cursor at the time this page was 'released' * by the last allocator thread. **/ int64_t _release_free_pos; ring_buffer _free_list; shared_write_cursor _free_write_cursor; read_cursor _free_read_cursor; uint32_t _pool_id; uint32_t _page_id; bool _posted; std::atomic _free_claim; }; /** * A pool is a collection of 'pages' that threads can claim to use * for allocation. 
* */ template struct pool { typedef page page_type; typedef page_type* page_ptr; typedef typename page_type::slot slot_type; typedef slot_type* slot_ptr; struct thread_local_data { thread_local_data() :current_page_num(-1), current_page(nullptr){} int32_t current_page_num; page_ptr current_page; }; ring_buffer _free_pages; // indexes into _alloc_pages shared_write_cursor _free_page_write_cursor; shared_read_cursor _free_page_read_cursor; ring_buffer _alloc_pages; // pages allocated (fixed index) shared_write_cursor _page_alloc_cursor; const read_cursor _page_alloc_begin; // used to prevent alloc_cursor from wrapping pool() :_free_page_write_cursor( MaxPages ), _free_page_read_cursor( MaxPages ), _page_alloc_cursor( MaxPages ) { _free_page_write_cursor.follows( _free_page_read_cursor ); _free_page_read_cursor.follows( _free_page_write_cursor ); // _page_alloc_cursor.follows( _page_alloc_begin ); //_page_alloc_begin.follows( _page_alloc_cursor ); // begin shouldn't move } static pool& instance() { static pool _p; return _p; } static thread_local_data*& local_pool() { static thread_local thread_local_data* _current = nullptr; return _current; } thread_local_data& get_local_pool() { thread_local_data*& cur = local_pool(); if( cur == nullptr ) { cur = new thread_local_data(); } return *cur; } char* do_alloc( uint16_t align = 8 ) { thread_local_data& tld = get_local_pool(); //get thread local data if( tld.current_page_num == -1 ) // we need to claim a page { claim_page(tld); assert( tld.current_page_num != -1 ); assert( tld.current_page ); } char* c = tld.current_page->alloc(align); while( !c ) // no space available, claim a new page { claim_page(tld); c = tld.current_page->alloc(align); if( !c ) { std::cerr<<"!!?? NULL??\n"; } } return c; } void do_free( char* c ) { uint8_t* s = reinterpret_cast(c); assert( c != nullptr ); assert( s[-1] == 8 ); uint8_t* slot_pos = (uint8_t*)c-8;//s + s[-1]-16; // s-1 == alignment, default 8 byte slot_ptr sl = reinterpret_cast(slot_pos); assert( sl->pool_id == PoolId ); assert( sl->page_slot < SlotsPerPage ); assert( sl->page_id < MaxPages ); auto p = _alloc_pages.at(sl->page_id); if( p->free(sl->page_slot) > SlotsPerPage/4 ) { if( !p->claim_free() ) return; // do I get to post this.. or does someone else.. 
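// --- Hedged aside (not part of this file): do_free() above recovers its bookkeeping by looking
// 8 bytes before the pointer handed back to the caller -- the slot header (page_id / pool_id /
// page_slot / alignment) sits immediately in front of the payload.  A minimal standalone
// illustration of that header-before-data pattern (malloc stands in for a pool slot here):
#include <cstdint>
#include <cstdlib>

struct hdr_sketch { int32_t page_id; int16_t pool_id; uint8_t page_slot; uint8_t alignment; };
static_assert( sizeof(hdr_sketch) == 8, "header must stay 8 bytes" );

inline char* tag_alloc( size_t n, hdr_sketch meta )
{
    char* raw = (char*)std::malloc( sizeof(hdr_sketch) + n ); // backing store for header + payload
    *(hdr_sketch*)raw = meta;                                 // write the header first
    return raw + sizeof(hdr_sketch);                          // caller only ever sees the payload
}

inline hdr_sketch* tag_of( char* payload )
{
    return (hdr_sketch*)( payload - sizeof(hdr_sketch) );     // same arithmetic as do_free()
}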
// move page into free queue auto claim = _free_page_write_cursor.claim(1); _free_pages.at(claim) = sl->page_id; PAGE_FREE_PRINT(std::cerr<<"PAGE AVAILABLE: "<page_id<<"\n"; std::cerr<<" sl->pool_id: "<pool_id)<<" slot: "<page_slot)<<" id: "<page_id)<<" SlotsPerPage: "<free_estimate()<<" \n"; std::cerr<<" free_page_write claim: "<release(); auto read_claim = _free_page_read_cursor.claim(1); if( !_free_page_read_cursor.is_available( read_claim ) ) { NEW_PRINT(std::cerr<<"NEW PAGE: free_read_claim_idx: "<free_estimate()<<"\n"; ) } tld.current_page->claim(); } static void free( char* c ) { instance().do_free(c); }; static char* alloc( uint16_t align = 8 ) { return instance().do_alloc(align); }; }; #define BENCH_SIZE ( (1024*256) ) #define ROUNDS 100 //#define BENCH_SIZE ( (512) ) //#define ROUNDS 5 #include void malloc_bench( int tid ) { std::vector a(BENCH_SIZE); memset( a.data(), 0, a.size() * sizeof(char*)); for( int x = 0; x < ROUNDS; ++x ) { for( int i = 0; i < BENCH_SIZE; ++i ) { int pos = rand() & 1; if( a[i] && pos ) { free(a[i]); a[i]=0; } else if( !a[i] && pos ) { a[i] = (char*)malloc(64); } } } } void bench(int tid) { std::vector a(BENCH_SIZE); memset( a.data(), 0, a.size() * sizeof(char*)); for( int x = 0; x < ROUNDS; ++x ) { for( int i = 0; i < BENCH_SIZE; ++i ) { int pos = rand() & 1; if( a[i] && pos ) { pool<1,64,256>::free(a[i]); a[i] = 0;//free(a[i]); } else if( !a[i] && pos ) { a[i] = pool<1,64,256>::alloc(); } } } } std::vector buffers[16]; void pc_bench_worker( int pro, int con, char* (*do_alloc)(int s), void (*do_free)(char*) ) { for( int r = 0; r < ROUNDS; ++r ) { for( int x = 0; x < buffers[pro].size()/2 ; ++x ) { int p = fast_rand() % buffers[pro].size(); if( !buffers[pro][p] ) { auto si = 60; //fast_rand() % (1<<15); auto r = do_alloc( si ); slot_header* sh = (slot_header*)(r-8);// TODO: handle alignment //assert( sh->alignment == 8 ); //assert( sh->pool_id > 3 ); if( r == nullptr ) { std::cerr<<"size: "<::alloc(); case 15: return pool<15,1<<15,16>::alloc(); case 14: return pool<14,1<<14,32>::alloc(); case 13: return pool<13,1<<13,64>::alloc(); case 12: return pool<12,1<<12,64>::alloc(); case 11: return pool<11,1<<11,64>::alloc(); case 10: return pool<10,1<<10,128>::alloc(); case 9: return pool<9,1<<9,128>::alloc(); case 8: return pool<8,1<<8,128>::alloc(); case 7: return pool<7,1<<7,256>::alloc(); case 6: return pool<6,1<<6,256>::alloc(); case 5: default: return pool<5,1<<5,256>::alloc(); } assert( !"we shoudln't get here!" ); } void do_hash_free(char* c) { assert( c != nullptr ); uint8_t a = *(c-1); // alignment slot_header* sh = (slot_header*)(c-8);// TODO: handle alignment assert( a == 8 ); if( !(sh->pool_id >=5 && sh->pool_id <= 16 ) ) { PRINT( std::cerr<< "ERROR: pool_id: "<pool_id<<"\n"; std::cerr.flush(); assert( sh->pool_id >=5 && sh->pool_id <= 16 ); ); } switch( sh->pool_id ) { case 16: pool<16,1<<16,8>::free(c); return; case 15: pool<15,1<<15,16>::free(c); return; case 14: pool<14,1<<14,32>::free(c); return; case 13: pool<13,1<<13,64>::free(c); return; case 12: pool<12,1<<12,64>::free(c); return; case 11: pool<11,1<<11,64>::free(c); return; case 10: pool<10,1<<10,128>::free(c); return; case 9: pool<9,1<<9,128>::free(c); return; case 8: pool<8,1<<8,128>::free(c); return; case 7: pool<7,1<<7,256>::free(c); return; case 6: pool<6,1<<6,256>::free(c); return; case 5: default: pool<5,1<<5,256>::free(c); return; } assert( !"we shoudln't get here!" 
);
}

int main( int argc, char** argv )
{
   if( argc > 1 && argv[1][0] == 'm' )
   {
      std::cerr<<"malloc multi\n";
      pc_bench( do_malloc, do_malloc_free );
   }
   if( argc > 1 && argv[1][0] == 'M' )
   {
      std::cerr<<"hash malloc multi\n";
      pc_bench( do_hash_malloc, do_hash_free );
   }
   if( argc > 1 && argv[1][0] == 's' )
   {
      std::cerr<<"malloc single\n";
      pc_bench_st( do_malloc, do_malloc_free );
   }
   if( argc > 1 && argv[1][0] == 'S' )
   {
      std::cerr<<"hash malloc single\n";
      pc_bench_st( do_hash_malloc, do_hash_free );
   }
   return 0;
}

================================================
FILE: ideas.txt
================================================

Global Ready Queue per Size Class of 256 each.... combined with 16 per thread per size
class (assuming 16 threads) means that in the 'idle state' we have

Size allocations are not 'random', but usually fall into predictable patterns.

The 'ideal' buffer size is one that is never full and never empty... if it ever empties
then the next time you fill it you should fill it 'fuller' than the last time... and
attempt to keep it there.  If the buffer is 'full' when you check then you can start
reclaiming data from that buffer.

GC Thread:
   For each size class... maintain a hash 'set' of the free chunks in that size class.
   When a new chunk comes in, look for its 'prev' in the hash set, if found remove it and
   merge the two... then look for the 'next', if found merge the two... then store the
   result back in the hash table after checking to see if the queue for that size class is
   waiting for data.  (A small standalone sketch of this merge set appears further down in
   this file.)

GC Thread Loop:
{
   foreach thread_garbage_bin
      pull all chunks, insert them into the merge set, then merge them if possible

   foreach size class
      refill the queue
         if the queue was empty... grow the queue by 4
         if the queue was full... increment the full count
            if the full count > N then reclaim 25% and reset the full count.
      pull chunks from the proper size heap...
         - if not enough are available then divide up chunks from the next size up.

   if a chunk reaches the 'page size' and the 'page size' block queue is empty then we can
   release it back to the OS.

   when there is no merging / reclaiming to do... set a flag and wait on a mutex... the
   next person to call free will wake me up when they see the flag set.

   When choosing empty chunks to place in the queue... pick the chunk from the block with
   the 'oldest' creation time.  This optimization requires more expensive 'sorting'; we can
   skip this step whenever there is demand for 'all chunks' of a particular class size, but
   when there is only demand for a fraction of the available chunks, then, because we are
   scanning the hash table linearly...

   Each node in the hash table points to prev/next pairs... when a hash is 'inserted' its
   memory location is based on its hash value, but its prev/next is based upon order of
   arrival.  Thus you can quickly find a node, then extract it like a double-linked-list.
}

Merge Cost: 2 hash lookups + 1 hash set and perhaps 2 hash clears,
            3 total calls to CityHash...

The 'free queue' can be a linked list of the 'freed chunks'.  Each thread has its 'ready
bin' which it will set 'if null', and its pending bin which it will fill if the ready bin
is not null.  The memory space in the block is converted into a 'next' pointer.  No large
per-thread 'free queues'.  Queues will adjust in length until they can handle the 'burst'
processing rate of the GC thread.  When the GC thread cannot keep the queues full, then
threads fall back on directly allocating their own chunks.

Overhead per block.. 8 byte header + 4 bytes in the free table or 8 bytes in the queue.
Queue sizes adjust.

Header: prev + next offsets.
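A hedged, standalone sketch (not code from this repository) of the merge-set idea above:
keep the free chunks in hash maps keyed by their start and one-past-end addresses, so a
newly freed chunk can find and absorb its left and right neighbours with O(1) lookups.
Container choice and names here are illustrative only.

#include <cstdint>
#include <cstddef>
#include <unordered_map>

struct merge_set_sketch
{
    std::unordered_map<uintptr_t, size_t>    by_start; // chunk start address -> chunk size
    std::unordered_map<uintptr_t, uintptr_t> by_end;   // one-past-end address -> chunk start

    void insert( uintptr_t start, size_t size )
    {
        auto left = by_end.find( start );              // does a free chunk end where we begin?
        if( left != by_end.end() )
        {
            uintptr_t lstart = left->second;
            size_t    lsize  = by_start[lstart];
            by_end.erase( left );
            by_start.erase( lstart );
            start = lstart;                            // grow backwards over the left neighbour
            size += lsize;
        }
        auto right = by_start.find( start + size );    // does a free chunk begin where we end?
        if( right != by_start.end() )
        {
            size_t rsize = right->second;
            by_start.erase( right );
            by_end.erase( start + size + rsize );
            size += rsize;                             // grow forwards over the right neighbour
        }
        by_start[start]      = size;                   // store the merged chunk back
        by_end[start + size] = start;
    }
};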
start of mmap chunk sets prev to 0 end of mmap chunk is a header with next = 0. ================================================ FILE: malloc2.cpp ================================================ /** * Each thread has its own 'arena' where it can allocate 'new' blocks of what ever size it needs (buckets). After * a thread is done with memory it places it in a garbage collection queue. * * The garbage collector follows each threads trash bin and moves the blocks into a recycled list that * all other threads can pull from. * * The garbage collector can grow these queues as necessary and shrink them as time progresses. */ #include //#include "mmap_alloc.hpp" #include "disruptor.hpp" #include #include "fast_rand.cpp" using namespace disruptor; #define PAGE_SIZE (4*1024*1024) #define BENCH_SIZE ( (2024) ) #define ROUNDS 200000 #define LOG2(X) ((unsigned) (8*sizeof (unsigned long long) - __builtin_clzll((X)) - 1)) struct block_header { uint32_t _page_pos; // how far from start of page uint32_t _prev; uint32_t _next; uint32_t _timestamp;// creation time... we want to use 'old blocks' first // because they are most likley to contain long-lived objects size_t calc_size(){ return _next - _page_pos; } int calc_bin_num(){ return LOG2(calc_size())+1; } }; block_header* allocate_block_page(); /** * 2MB chunk of memory that gets divided up * 'on request', rounded to the nearest multiple * of 128 bytes so that it can be binned/cached * effectively. */ struct page { block_header data[PAGE_SIZE/sizeof(block_header)]; }; class thread_allocator { public: void free( char* c ) { block_header* b = reinterpret_cast(c) - 1; int bin = b->calc_bin_num(); if( _cache_pos[bin] > _cache_end[bin] - 32 ) { _cache[bin].at(_cache_end[bin]++) = c; return; } auto pos = _gc_read_end_buffer; _garbage_bin.at(pos) = c; _gc_read_end_buffer = pos + 1; /* _gc_read_end_buffer = pos + 1; */ if( _gc_read_end_buffer - _gc_read_end_last_write > 10 ) { _gc_read_end = _gc_read_end_last_write = _gc_read_end_buffer; } } char* alloc( size_t s ); static thread_allocator& get() { static __thread thread_allocator* tld = nullptr; if( !tld ) // new is not an option { tld = reinterpret_cast( malloc(sizeof(thread_allocator))/*mmap_alloc( sizeof(thread_allocator)*/ ); tld = new (tld) thread_allocator(); // inplace construction // TODO: allocate pthread_threadlocal var, attach a destructor /clean up callback // to that variable... } return *tld; } protected: char* split_chunk( char* c, size_t l ); thread_allocator(); ~thread_allocator(); friend class garbage_collector; int64_t _gc_begin; // how far has gc processed int64_t _pad[7]; // save the cache lines/prevent false sharing int64_t _gc_read_end; // how far can gc read int64_t _pad2[7]; // save the cache lines/prevent false sharing int64_t _gc_read_end_buffer; // cache writes to gc_read_end to every 10 writes int64_t _gc_read_end_last_write; // cache writes to gc_read_end to every 10 writes int64_t _cache_pos[32]; int64_t _cache_end[32]; char* get_garbage( int64_t pos ) // grab a pointer previously claimed. { // we may have to dynamically reallocate our gbin return _garbage_bin.at(pos); } block_header* _next_block; ring_buffer _garbage_bin; ring_buffer _cache[32]; }; typedef thread_allocator* thread_alloc_ptr; /** * Polls all threads for freed items. * Upon receiving a freed item, it will look * at its size and move it to the proper recycle * bin for other threads to consume. 
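// --- Hedged aside (not from this file): free() above batches its publication -- freed pointers
// land in the thread's garbage ring immediately, but the cursor the gc thread polls
// (_gc_read_end) is only re-published roughly every 10 entries, so the shared cache line is
// written an order of magnitude less often.  A distilled sketch of that batched-publish idea
// (items may sit unpublished until the next batch, the same trade-off as the code above):
#include <atomic>
#include <cstdint>

struct batched_cursor_sketch
{
    std::atomic<int64_t> published{0};    // polled by the gc thread
    int64_t              buffered  = 0;   // private running count on the owning thread
    int64_t              last_push = 0;

    void advance()                        // call once per freed item
    {
        ++buffered;
        if( buffered - last_push >= 10 )  // amortize the cross-thread write
        {
            published.store( buffered, std::memory_order_release );
            last_push = buffered;
        }
    }
};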
* * When there is less work to do, the garbage collector * will attempt to combine blocks into larger blocks * and move them to larger cache sizes until it * ultimately 'completes a page' and returns it to * the system. * * From the perspective of the 'system' an alloc * involves a single atomic fetch_add. * * A free involves a non-atomic store. * * No other sync is necessary. */ class garbage_collector { public: garbage_collector(); ~garbage_collector(); /** * Handles objects of the same size. */ class recycle_bin { public: recycle_bin(int num = 0) :_next_write(0),_write_pos(0),_read_pos(0),_bin_num(num) { } void sync_write_pos() { // ((std::atomic*)&_write_pos)->load(); } int64_t _next_write; int64_t _pad0[7]; int64_t _write_pos; int64_t _pad[7]; std::atomic _read_pos; int64_t _pad2[7]; ring_buffer _free_bin; int _bin_num; }; std::atomic _sync; int get_bin_num( size_t s ) { return LOG2(s)+1; } recycle_bin& get_bin( size_t bin_num ) { assert( bin_num < 32 ); return _bins[bin_num]; } void register_allocator( thread_alloc_ptr ta ); void unregister_allocator( thread_alloc_ptr ta ); static garbage_collector& get() { static garbage_collector gc; return gc; } private: static void run(); void recycle( char* c ); std::thread _thread; recycle_bin _bins[32]; std::atomic _next_talloc; thread_alloc_ptr _tallocs[128]; static std::atomic _done; }; std::atomic garbage_collector::_done(false); garbage_collector::garbage_collector() :_thread( &garbage_collector::run ) { memset( _tallocs, 0, sizeof(_tallocs) ); } garbage_collector::~garbage_collector() { _done.store(true, std::memory_order_release ); _thread.join(); } void garbage_collector::register_allocator( thread_alloc_ptr ta ) { printf( "registering thread allocator %p\n", ta ); // TODO: just lock here... auto pos = _next_talloc.fetch_add(1); _tallocs[pos] = ta; } void garbage_collector::unregister_allocator( thread_alloc_ptr ta ) { for( int i = 0; i < 128; ++i ) { if( _tallocs[i] == ta ) { _tallocs[i] = nullptr; } } } void garbage_collector::run() { garbage_collector& self = garbage_collector::get(); while( true ) { bool found_work = false; for( int i = 0; i < 128; i++ ) { // TODO: not safe assumption, threads can come/go at will // leaving holes... thread cleanup code needs locks around it // to prevent holes.. if( self._tallocs[i] != nullptr ) { auto b = self._tallocs[i]->_gc_begin; auto e = self._tallocs[i]->_gc_read_end; if( b != e ) found_work = true; for( auto p = b; p < e; ++p ) { char* c = self._tallocs[i]->get_garbage(p); self.recycle( c); } self._tallocs[i]->_gc_begin = e; } } if( !found_work ) { // usleep(0); if( _done.load( std::memory_order_acquire ) ) return; } } } void garbage_collector::recycle( char* c ) { block_header* h = ((block_header*)c)-1; assert( h->_next - h->_page_pos > 0 ); recycle_bin& b = get_bin( get_bin_num(h->_next - h->_page_pos) ); auto p = b._next_write++; while( b._free_bin.at(p) != nullptr ) { // fprintf( stderr, "opps.. someone left something behind...\n" ); p = b._next_write++; } b._free_bin.at(p) = c; b._write_pos = p; // if( b._write_pos % 256 == 128 ) // b.sync_write_pos(); } block_header* allocate_block_page() { fprintf( stderr, "#" ); auto limit = malloc(PAGE_SIZE);//mmap_alloc( PAGE_SIZE ); block_header* _next_block = reinterpret_cast(limit); _next_block->_page_pos = 0; _next_block->_prev = 0; _next_block->_next = PAGE_SIZE; // next block always goes to end...; _next_block->_timestamp = 0; // TODO... 
return _next_block; } thread_allocator::thread_allocator() { _gc_begin = 0; _gc_read_end = 0; _gc_read_end_buffer = 0; _gc_read_end_last_write = 0; _next_block = allocate_block_page(); memset( _cache_pos, 0, sizeof(_cache_pos) ); memset( _cache_end, 0, sizeof(_cache_end) ); garbage_collector::get().register_allocator(this); } thread_allocator::~thread_allocator() { // give the rest of our allocated chunks to the gc thread free( reinterpret_cast(_next_block+1) ); garbage_collector::get().unregister_allocator(this); // GARBAGE COLLECTOR must do the mmap free because we don't know // when it will notice this thread going away... // TODO: post a message to GC to track thread cleanup. // mmap_free( this, sizeof(*this) ); } /** * returns len bytes starting at s, potentially freeing * anything after s+len. */ char* thread_allocator::split_chunk( char* s, size_t len ) { return s; } char* thread_allocator::alloc( size_t s ) { assert( s > 0 ); s = 64*((s + 63)/64); // multiples of 64 bytes if( s+sizeof(block_header) >= PAGE_SIZE ) { assert( false ); // do direct mmap return nullptr; } int bin_num = garbage_collector::get().get_bin_num( s ); int limit = std::min(bin_num + 4,32); for( int i = bin_num; i < limit; ++i ) { if( _cache_pos[i] < _cache_end[i] ) { char* c = _cache[i].at(_cache_pos[i]); ++_cache_pos[i]; return split_chunk( c, s ); } } static int64_t hit = 0; static int64_t miss = 0; static int64_t sync_count = 0; ++sync_count; // if( sync_count % 64 == 63 ) // rb->sync_write_pos(); int end_bin = bin_num+1;// + 4; for( ; bin_num < end_bin; ++ bin_num ) { garbage_collector::recycle_bin* rb = &garbage_collector::get().get_bin( bin_num ); while( rb ) { // TODO: ATOMIC ... switch to non-atomic check auto write_pos = rb->_write_pos; // printf( "recyclebin wirte_pos: %d read_cur.begin %d\n", write_pos, rb->_read_cur.pos().aquire() ); auto avail = write_pos - *((int64_t*)&rb->_read_pos); if( avail > 16 )// /*.load( std::memory_order_relaxed )*/ < write_pos ) { // ATOMIC CLAIM FROM SHARED POOL... MOST EXPENSIVE OP WE HAVE... //auto pos = rb->_read_cur.pos().atomic_increment_and_get(1)-1; //auto pos = rb->_read_pos.fetch_add(4,std::memory_order_relaxed); auto pos = rb->_read_pos.fetch_add(8);//,std::memory_order_acquire); auto e = pos + 8; while( pos < e ) { char* b = rb->_free_bin.at(pos); if( b ) { _cache[bin_num].at(_cache_end[bin_num]++) = b; rb->_free_bin.at(pos) = nullptr; } else { // fprintf( stderr, "read too much..\n" ); } ++pos; } if( _cache_pos[bin_num] < _cache_end[bin_num] ) { char* c = _cache[bin_num].at(_cache_pos[bin_num]); ++_cache_pos[bin_num]; ++hit; return c; } } // else there are no blocks our size... go up a size or two?.. break; } ++miss; // if( miss % 10000 == 0 ) fprintf( stderr, "\nHit: %lld Miss: %lld \r", hit, miss ); } // we already checked the 'best fit' bin and failed to find // anything that size ready, so we can allocate it from our // thread local block // printf( "allocating new chunk from thread local page\n" ); // make sure the thread local block has enough space... if( _next_block->_page_pos + s + sizeof(block_header) >= PAGE_SIZE ) { // not enough space left in current block.. free it... if it has any space at all. 
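// --- Hedged aside (not from this file): when the local caches miss, alloc() above pays for a
// single atomic fetch_add but claims a batch of 8 slots from the shared recycle bin; the first
// usable block satisfies the current request and the rest are parked in the thread-local cache,
// amortizing the shared counter across several allocations.  Empty slots are simply skipped.
// Distilled sketch (names and sizes are illustrative):
#include <atomic>
#include <cstdint>
#include <vector>

struct shared_bin_sketch
{
    std::atomic<int64_t> read_pos{0};
    void*                slots[4096] = {};   // refilled by the gc thread
};

inline void* claim_batch( shared_bin_sketch& bin, std::vector<void*>& local_cache, int batch = 8 )
{
    int64_t pos   = bin.read_pos.fetch_add( batch );  // one atomic op pays for `batch` slots
    void*   first = nullptr;
    for( int64_t i = pos; i < pos + batch; ++i )
    {
        void* b = bin.slots[i % 4096];
        if( !b ) continue;                  // slot empty or not yet refilled: drop it on the floor
        bin.slots[i % 4096] = nullptr;      // tell the gc thread we took it
        if( !first ) first = b;             // first hit satisfies the current request
        else local_cache.push_back( b );    // the rest refill the local cache
    }
    return first;                           // may be null: caller falls back to a fresh page
}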
if( _next_block->_page_pos != PAGE_SIZE ) { free( (char*)(_next_block+1) ); } _next_block = allocate_block_page(); assert( _next_block != nullptr ); } // fprintf( stderr, "alloc %d at block pos %d\n", s+1, _next_block->_page_pos ); block_header* new_b = _next_block; _next_block = new_b + 1 + s/sizeof(block_header); _next_block->_page_pos = new_b->_page_pos + sizeof(block_header) + s; _next_block->_prev = new_b->_page_pos; _next_block->_next = PAGE_SIZE; // next block always goes to end... _next_block->_timestamp = new_b->_timestamp; // TODO... new_b->_next = _next_block->_page_pos; // our work here is done give them the newly allocated block (pointing after the header return reinterpret_cast(new_b+1); } char* malloc2( int s ) { return thread_allocator::get().alloc(s); } void free2( char* s ) { return thread_allocator::get().free(s); } /* SEQUENTIAL BENCH int main( int argc, char** argv ) { if( argc == 2 && argv[1][0] == 'S' ) { printf( "malloc2\n"); for( int i = 0; i < 50000000; ++i ) { char* test = malloc2( 128 ); assert( test != nullptr ); test[0] = 1; free2( test ); } } if( argc == 2 && argv[1][0] == 's' ) { printf( "malloc\n"); for( int i = 0; i < 50000000; ++i ) { char* test = (char*)malloc( 128 ); assert( test != nullptr ); test[0] = 1; free( test ); } } fprintf( stderr, "done\n"); // sleep(5); return 0; } */ /* RANDOM BENCH */ std::vector buffers[16]; void pc_bench_worker( int pro, int con, char* (*do_alloc)(int s), void (*do_free)(char*) ) { for( int r = 0; r < ROUNDS; ++r ) { for( int x = 0; x < buffers[pro].size()/2 ; ++x ) { uint32_t p = fast_rand() % buffers[pro].size(); if( !buffers[pro][p] ) { uint64_t si = 32 + fast_rand()%(8096*16); //4000;//32 + fast_rand() % (1<<16); auto r = do_alloc( si ); assert( r != nullptr ); // assert( r[0] != 99 ); // r[0] = 99; buffers[pro][p] = r; } } for( int x = 0; x < buffers[con].size()/2 ; ++x ) { uint32_t p = fast_rand() % buffers[con].size(); assert( p < buffers[con].size() ); assert( con < 16 ); assert( con >= 0 ); if( buffers[con][p] ) { //assert( buffers[con][p][0] == 99 ); // buffers[con][p][0] = 0; do_free(buffers[con][p]); buffers[con][p] = 0; } } } } void pc_bench(int n, char* (*do_alloc)(int s), void (*do_free)(char*) ) { for( int i = 0; i < 16; ++i ) { buffers[i].resize( BENCH_SIZE ); memset( buffers[i].data(), 0, 8 * BENCH_SIZE ); } std::thread* a = nullptr; std::thread* b = nullptr; std::thread* c = nullptr; std::thread* d = nullptr; std::thread* e = nullptr; std::thread* f = nullptr; std::thread* g = nullptr; std::thread* h = nullptr; std::thread* i = nullptr; std::thread* j = nullptr; int s = 1; switch( n ) { case 10: a = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } ); n--; s++; case 9: b = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } ); n--; s++; case 8: c = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } ); n--; s++; case 7: d = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } ); n--; s++; case 6: e = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } ); n--; s++; case 5: f = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } ); n--; s++; case 4: g = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } ); n--; s++; case 3: h = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } ); n--; s++; case 2: i = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free ); } ); n--; s++; case 1: j = new std::thread( [=](){ pc_bench_worker( n, s, do_alloc, do_free 
); } ); } if(a) a->join(); if(b) b->join(); if(c) c->join(); if(d) d->join(); if(e) e->join(); if(f) f->join(); if(g) g->join(); if(h) h->join(); if(i) i->join(); if(j) j->join(); } void pc_bench_st(char* (*do_alloc)(int s), void (*do_free)(char*) ) { for( int i = 0; i < 16; ++i ) { buffers[i].resize( BENCH_SIZE ); memset( buffers[i].data(), 0, 8 * BENCH_SIZE ); } int i = 0; std::thread a( [=](){ pc_bench_worker( 1, 1, do_alloc, do_free ); } ); a.join(); } #include char* do_malloc(int s) { // return (char*)::malloc(s); return (char*)scalable_malloc(s); } void do_malloc_free(char* c) { scalable_free(c); // ::free(c); } int main( int argc, char** argv ) { if( argc > 2 && argv[1][0] == 'm' ) { std::cerr<<"malloc multi\n"; pc_bench( atoi(argv[2]), do_malloc, do_malloc_free ); } if( argc > 2 && argv[1][0] == 'M' ) { std::cerr<<"hash malloc multi\n"; pc_bench( atoi(argv[2]), malloc2, free2 ); } if( argc > 1 && argv[1][0] == 's' ) { std::cerr<<"malloc single\n"; pc_bench_st( do_malloc, do_malloc_free ); } if( argc > 1 && argv[1][0] == 'S' ) { std::cerr<<"hash malloc single\n"; pc_bench_st( malloc2, free2 ); } return 0; } ================================================ FILE: malloc2.hpp ================================================ ================================================ FILE: malloc3.cpp ================================================ /** * Each thread has its own 'arena' where it can allocate 'new' blocks of what ever size it needs (buckets). After * a thread is done with memory it places it in a garbage collection queue. * * The garbage collector follows each threads trash bin and moves the blocks into a recycled list that * all other threads can pull from. * * The garbage collector can grow these queues as necessary and shrink them as time progresses. */ #include #include #include "mmap_alloc.hpp" #include "disruptor.hpp" #include #include #include #include #include #include #include #include #include #include //#include "rand.cpp" using namespace disruptor; #define PAGE_SIZE (4*1024*1024) #define BENCH_SIZE ( (1024) ) #define ROUNDS 200000 #define LOG2(X) ((unsigned) (8*sizeof (unsigned long long) - __builtin_clzll((X)) - 1)) #define NUM_BINS 32 // log2(PAGE_SIZE) class block_header { public: block_header* next() { assert(this); if( _size > 0 ) return reinterpret_cast(data()+_size); else return nullptr; } block_header* prev() { assert(this); if( _prev_size <= 0 ) return nullptr; return reinterpret_cast(reinterpret_cast(this) - _prev_size - 8); } enum flags_enum { unknown = 0, idle = 1, // in storage, mergable queued = 2, // in waiting queue... cached = 4, // cached in thread active = 8, // in use by app mergable = 16 // track this or will false sharing kill me? 
}; struct queue_state // the block is serving as a linked-list node { block_header* next; block_header* prev; }; void set_state( flags_enum e ) { _flags = e; } flags_enum get_state() { return (flags_enum)_flags; } queue_state& as_queue_node() { return *reinterpret_cast(data()); } queue_state& init_as_queue_node() { // _flags |= queued; queue_state& s = as_queue_node(); s.next = nullptr; s.prev = nullptr; return s; } void init( int s ) { _prev_size = 0; _size = - (s-8); } char* data() { return ((char*)this)+8; } int size()const { return abs(_size); } int raw_size()const { return _size; } int raw_prev_size()const { return _prev_size; } int calc_forward_extent() { // fprintf( stderr, "pos %p + %d -> ", this, _size ); int s = size() + 8; auto n = next(); if( n ) s += n->calc_forward_extent(); return s; } int page_size() { auto h = head(); assert(h); return head()->calc_forward_extent(); } block_header* head() { auto pre = prev(); if( !pre ) return this; do { auto next_prev = pre->prev(); if( !next_prev ) return pre; pre = next_prev; } while ( true ); } /** create a new block at p and return it */ block_header* split_after( int s ) { assert( s >= 32 ); // fprintf( stderr, "prev_size %d _size %d Initial Error: %d\n", _prev_size, _size, int(PAGE_SIZE - this->page_size()) ); assert( PAGE_SIZE == page_size() ); if( (size() - 8 -32) < s ) return nullptr;// no point in splitting to less than 32 bytes block_header* n = reinterpret_cast(data()+s); n->_prev_size = s; n->_size = size() -s -8; if( _size < 0 ) n->_size = -n->_size; // we just split the tail _size = s; // this node now has size s assert( size() >= s ); assert( PAGE_SIZE == n->page_size() ); assert( PAGE_SIZE == page_size() ); return n; } // merge this block with next, return head of new block. block_header* merge_next() { assert( PAGE_SIZE == page_size() ); assert( _flags == block_header::idle ); auto nxt = next(); if( !nxt ) return this; assert( nxt->page_size() == PAGE_SIZE ); // next must be in the idle state if( nxt->_flags != idle ) return this; // extract node from the double link list it is in. queue_state& qs = nxt->as_queue_node(); if( qs.next ) { // assert( qs.next->as_queue_node().prev == nxt ); qs.next->as_queue_node().prev = qs.prev; } if( qs.prev ) { // assert( qs.prev->as_queue_node().next == nxt ); qs.prev->as_queue_node().next = qs.next; } // now we are free to merge the memory _size += nxt->size() + 8; fprintf( stderr, "merged to size %d\n", _size ); if( nxt->_size < 0 ) _size = -_size; nxt = next(); // find the new next. if( nxt ) { nxt->_prev_size = size(); } assert( PAGE_SIZE == page_size() ); if( next() ) assert( PAGE_SIZE == next()->page_size() ); if( prev() ) assert( PAGE_SIZE == prev()->page_size() ); return this; } // merge this block with the prev, return the head of new block block_header* merge_prev() { _flags = idle; // mark myself as idle/mergable auto p = prev(); if( !p ) return this; if( p->_flags != idle ) return this; return p->merge_next(); } private: int32_t _prev_size; // size of previous header. int32_t _size:24; // offset to next, negitive indicates tail, 8 MB max, it could be neg int32_t _flags:8; // offset to next, negitive indicates tail }; static_assert( sizeof(block_header) == 8, "Compiler is not packing data" ); /** returns a new block page allocated via mmap * The page has 2 block headers (head+tail) defined * and head is returned. 
**/ block_header* allocate_block_page(); struct block_list_node { block_list_node():next(nullptr){}; block_list_node* next; block_header* header() { return reinterpret_cast(reinterpret_cast(this)-8); } int count() { int count = 1; auto n = next; while( n ) { ++count; assert( count < 1000 ); n = n->next; } return count; } block_list_node* find_end() { block_list_node* n = this; while( n->next ) { n = n->next; } return n; } }; class thread_allocator { public: char* alloc( size_t s ); void free( char* c ) { auto node = reinterpret_cast(c-8); // store a point node->init_as_queue_node().next = _gc_on_deck; if( !_gc_at_bat ) { _gc_at_bat = node; _gc_on_deck = nullptr; } else { _gc_on_deck = node; } } static thread_allocator& get() { static __thread thread_allocator* tld = nullptr; if( !tld ) // new is not an option { tld = reinterpret_cast( mmap_alloc( sizeof(thread_allocator) ) ); tld = new (tld) thread_allocator(); // inplace construction // TODO: allocate pthread_threadlocal var, attach a destructor /clean up callback // to that variable... } return *tld; } void print_cache() { for( int i = 0; i < NUM_BINS; ++i ) { fprintf( stderr, "%d] size %d \n", i, _bin_cache_size[i] ); } } protected: bool store_cache( block_header* h ) { assert( h->page_size() == PAGE_SIZE ); auto bin = LOG2( h->size() ); if( _bin_cache[bin] == nullptr ) { _bin_cache[bin] = h; return true; } return false; /* assert( h != nullptr ); if( _bin_cache_size[bin] < 4 ) { if( _bin_cache_size[bin] == 0 ) assert( nullptr == _bin_cache[bin] ); block_list_node* bln = reinterpret_cast(h->data() ); bln->next = _bin_cache[bin]; _bin_cache[bin] = bln; _bin_cache_size[bin]++; assert( _bin_cache_size[bin] == _bin_cache[bin]->count() ); return true; } fprintf( stderr, "cache full bin %d size %d", bin, _bin_cache_size[bin] ); assert( _bin_cache[bin] != nullptr ); return false; */ } block_header* fetch_cache( int bin ) { if( _bin_cache[bin] ) { block_header* b = _bin_cache[bin]; assert( b->page_size() == PAGE_SIZE ); _bin_cache[bin] = nullptr; return b; } return nullptr; /* if( _bin_cache_size[bin] > 0 ) { assert( _bin_cache_size[bin] == _bin_cache[bin]->count() ); assert( _bin_cache[bin] ); auto h = _bin_cache[bin]; _bin_cache[bin] = h->next; _bin_cache_size[bin]--; auto head = h->header(); assert( head->page_size() == PAGE_SIZE ); assert( LOG2(head->size()) >= bin ); assert( LOG2(head->size()) == bin ); return head; } assert( !_bin_cache[bin] ); */ return nullptr; } block_header* fetch_block_from_bin( int bin ); thread_allocator(); ~thread_allocator(); friend class garbage_collector; bool _done; // cleanup and remove from list. std::atomic _gc_at_bat; // where the gc pulls from. uint64_t _gc_pad[7]; // gc thread and this thread should not false-share these values block_header* _gc_on_deck; // where we save frees while waiting on gc to bat. /** * called by gc thread and pops the at-bat free list */ block_header* get_garbage() // grab a pointer previously claimed. { if( block_header* gar = _gc_at_bat.load() ) { _gc_at_bat.store(nullptr);// = nullptr; return gar; } return nullptr; } block_header* _bin_cache[NUM_BINS]; // head of cache for specific bin int16_t _bin_cache_size[NUM_BINS]; // track num of nodes in cache thread_allocator* _next; // used by gc to link thread_allocs together }; typedef thread_allocator* thread_alloc_ptr; /** * Polls all threads for freed items. * Upon receiving a freed item, it will look * at its size and move it to the proper recycle * bin for other threads to consume. 
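// --- Hedged aside (not from this file): the thread_allocator::free() path above is the
// "at bat / on deck" hand-off: the owning thread links freed blocks onto a private on-deck list
// and only promotes it to the single shared at-bat slot when that slot is empty; the gc thread
// takes the whole at-bat list with one exchange.  Neither side ever spins.  Distilled version
// (illustrative names):
#include <atomic>

struct free_node_sketch { free_node_sketch* next = nullptr; };

struct handoff_sketch
{
    std::atomic<free_node_sketch*> at_bat{nullptr};  // single slot the gc thread consumes
    free_node_sketch*              on_deck = nullptr; // private to the owning thread

    void producer_push( free_node_sketch* n )         // called by the owning thread on free()
    {
        n->next = on_deck;
        on_deck = n;
        if( at_bat.load( std::memory_order_relaxed ) == nullptr )
        {
            at_bat.store( on_deck, std::memory_order_release ); // promote the whole list
            on_deck = nullptr;
        }
    }

    free_node_sketch* consumer_take()                  // called by the gc thread
    {
        return at_bat.exchange( nullptr, std::memory_order_acquire );
    }
};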
* * When there is less work to do, the garbage collector * will attempt to combine blocks into larger blocks * and move them to larger cache sizes until it * ultimately 'completes a page' and returns it to * the system. * * From the perspective of the 'system' an alloc * involves a single atomic fetch_add. * * A free involves a non-atomic store. * * No other sync is necessary. */ class garbage_collector { public: garbage_collector(); ~garbage_collector(); class recycle_bin { public: recycle_bin() :_read_pos(0),_full_count(0),_full(2),_write_pos(0) { memset( &_free_queue, 0, sizeof(_free_queue) ); _free_list = nullptr; } // read the _read_pos without any atomic sync, we only care about an estimate int64_t available() { return _write_pos - *((int64_t*)&_read_pos); } // reserve right to read the next num spots from buffer int64_t claim( int64_t num ) { return _read_pos.fetch_add(num); } block_header* get_block( int64_t claim_pos ) { return _free_queue.at(claim_pos); } void clear_block( int64_t claim_pos ) { _free_queue.at(claim_pos) = nullptr; } // determines how many chunks should be required to consider this bin full. // TODO: this method needs to be tweaked to factor in 'time'... as it stands // now the GC loop will be very agressive at shrinking the queue size int64_t check_status() { return 8 - available(); /* auto av = available(); int consumed = _last_fill - av; if( consumed > _last_fill/2 ) ++_full; if( av <= 0 ) { // apparently there is high demand, the consumers cleaned us out. _full *= 2; // exponential growth.. _full = std::min( _full+4, _free_queue.get_buffer_size() -1 ); fprintf( stderr, "%d blocks available, _full %d\n", int(av), int(_full) ); } else if( av == _full ) { // apparently no one wanted any... we should shrink what we consider full _full -= 4; // fast back off if( _full < 2 ) _full = 2; } else // av < _full { // some, but not all have been consumed... // if less than half have been consumed... reduce size, // else keep the size the same. if( av > _full/2 ) { _full--; // reduce full size,slow back off if( _full < 2 ) _full = 2; return _full - av; } else // more than half consumed... keep full size the same, refill { } } fprintf( stderr, "%d blocks available, _full %d post %d\n", int(av), int(_full), int(_full-av) ); return _full - av; */ } ring_buffer _free_queue; std::atomic _read_pos; //written to by read threads int64_t _pad[7]; // below this point is written to by gc thread int64_t _full_count; // how many times gc thread checked and found the queue full int64_t _full; // limit the number of blocks kept in queue int64_t _write_pos; // read by consumers to know the last valid entry. int64_t _last_fill; // status of the buffer at the last check. 
void push( block_header* h ) { h->set_state( block_header::idle ); block_header::queue_state& qs = h->init_as_queue_node(); qs.next = _free_list; if( _free_list ) { _free_list->as_queue_node().prev = h; } _free_list = h; } block_header* pop() { auto tmp = _free_list; if( _free_list ) { auto n = _free_list->as_queue_node().next; if( n ) n->as_queue_node().prev = nullptr; _free_list = n; assert( tmp->get_state() == block_header::idle ); tmp->set_state( block_header::unknown ); // TODO: only if DEBUG } return tmp; } // blocks are stored as a double-linked list block_header* _free_list; }; recycle_bin& find_cache_bin_for( block_header* h ) { assert(h!=nullptr); int bn = get_bin_num(h->size()); // fprintf( stderr, "block header size %d is cached in bin %d holding sizes %d\n", (int)h->size(), bn, (1<<(bn)) ); return get_bin(get_bin_num( h->size() )); } int get_bin_num( size_t s ) { return LOG2(s); } recycle_bin& get_bin( size_t bin_num ) { assert( bin_num < NUM_BINS ); return _bins[bin_num]; } void register_allocator( thread_alloc_ptr ta ); static garbage_collector& get() { static garbage_collector gc; return gc; } private: static void run(); // threads that we are actively looping on std::atomic _thread_head; std::thread _thread; // gc thread.. doing the hard work recycle_bin _bins[NUM_BINS]; static std::atomic _done; }; std::atomic garbage_collector::_done(false); garbage_collector::garbage_collector() :_thread_head(nullptr),_thread( &garbage_collector::run ) { fprintf( stderr, "allocating garbage collector\n" ); } garbage_collector::~garbage_collector() { _done.store(true, std::memory_order_release ); _thread.join(); } void garbage_collector::register_allocator( thread_alloc_ptr ta ) { printf( "registering thread allocator %p\n", ta ); auto* stale_head = _thread_head.load(std::memory_order_relaxed); do { ta->_next = stale_head; }while( !_thread_head.compare_exchange_weak( stale_head, ta, std::memory_order_release ) ); } void garbage_collector::run() { fprintf( stderr, "Starting GC loop\n"); try { garbage_collector& self = garbage_collector::get(); while( true ) { thread_alloc_ptr cur_al = *((thread_alloc_ptr*)&self._thread_head); bool found_work = false; // for each thread, grab all of the free chunks and move them into // the proper free set bin, but save the list for a follow-up merge // that takes into consideration all free chunks. while( cur_al ) { auto cur = cur_al->get_garbage(); if( cur ) { assert( cur->page_size() == PAGE_SIZE ); found_work = true; } while( cur ) { assert( cur->page_size() == PAGE_SIZE ); block_header* nxt = cur->as_queue_node().next; assert( nxt != cur ); if( nxt ) assert( nxt->page_size() == PAGE_SIZE ); assert( cur->page_size() == PAGE_SIZE ); auto before = cur->size(); // fprintf( stderr, "found free block of size: %d\n", cur->size() ); cur->init_as_queue_node(); assert( cur->page_size() == PAGE_SIZE ); cur->set_state( block_header::idle ); assert( cur->page_size() == PAGE_SIZE ); cur = cur->merge_next(); // cur = cur->merge_prev(); if( before != cur->size() ) fprintf( stderr, "found free block of after merges..: %d\n", cur->size() ); assert( cur->page_size() == PAGE_SIZE ); recycle_bin& c_bin = self.find_cache_bin_for(cur); assert( cur->page_size() == PAGE_SIZE ); // fprintf( stderr, "pushing into bin\n" ); c_bin.push(cur); assert( cur->page_size() == PAGE_SIZE ); cur = nxt; assert( cur->page_size() == PAGE_SIZE ); } assert( cur_al != cur_al->_next ); // get the next thread. 
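// --- Hedged aside (not from this file): push()/pop() above keep the per-bin free list *inside*
// the freed blocks themselves -- once a block is idle, its payload is reused as a pair of
// prev/next pointers (init_as_queue_node), so the allocator needs no side storage for the list.
// Minimal standalone illustration of that intrusive free-list idea:
#include <cstddef>

struct links_sketch { void* next; void* prev; };

// treat the first 16 bytes of a dead block as its list node
inline links_sketch* as_node( void* dead_block ) { return (links_sketch*)dead_block; }

inline void push_free( void*& head, void* blk )
{
    as_node(blk)->next = head;
    as_node(blk)->prev = nullptr;
    if( head ) as_node(head)->prev = blk;
    head = blk;
}

inline void* pop_free( void*& head )
{
    void* top = head;
    if( top )
    {
        head = as_node(top)->next;
        if( head ) as_node(head)->prev = nullptr;
    }
    return top;   // caller re-initializes the block before handing it out
}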
cur_al = cur_al->_next; } // for each recycle bin, check the queue to see if it // is getting low and if so, put some chunks in play for( int i = 0; i < NUM_BINS; ++i ) { garbage_collector::recycle_bin& bin = self._bins[i]; auto needed = bin.check_status(); // returns the number of chunks need if( needed > 0 ) { int64_t next_write_pos = bin._write_pos; block_header* next = bin.pop(); while( next && needed > 0 ) { // fprintf( stderr, "poping block from bin %d and pushing into queue\n", i ); found_work = true; ++next_write_pos; if( bin._free_queue.at(next_write_pos) ) { // someone left something behind... } else { bin._free_queue.at(next_write_pos) = next; next = bin.pop(); } --needed; } if( next ) bin.push(next); // leftover... bin._write_pos = next_write_pos; } else if( needed < 0 ) { // apparently no one is checking this size class anymore, we can reclaim some nodes. // TODO: perhaps we only do this if there is no other work found as work implies // that the user is still allocating / freeing objects and thus we don't want to // compete to start freeing cache yet... } } if( !found_work ) usleep( 1000 ); if( _done.load( std::memory_order_acquire ) ) return; if( !found_work ) { // reclaim cache // sort... and optimize.... } } } catch ( ... ) { fprintf( stderr, "gc caught exception\n" ); } fprintf( stderr, "exiting gc loop\n" ); } block_header* allocate_block_page() { fprintf( stderr, "\n\n ALLOCATING NEW PAGE\n\n" ); auto limit = mmap_alloc( PAGE_SIZE ); block_header* bl = reinterpret_cast(limit); bl->init( PAGE_SIZE ); return bl; } thread_allocator::thread_allocator() { _done = false; _next = nullptr; //_gc_at_bat = nullptr; _gc_on_deck = nullptr; memset( _bin_cache, 0, sizeof(_bin_cache) ); memset( _bin_cache_size, 0, sizeof(_bin_cache_size) ); garbage_collector::get().register_allocator(this); } thread_allocator::~thread_allocator() { // give the rest of our allocated chunks to the gc thread // free all cache, free _alloc_block _done = true; } int get_min_bin( size_t s ) { return LOG2(s)+1; } char* thread_allocator::alloc( size_t s ) { // fprintf( stderr, " alloc %d\n", (int)s ); if( s == 0 ) return nullptr; size_t data_size = s; // we need 8 bytes for the header, then round to the nearest // power of 2. int min_bin = LOG2(s+7)+1; // this is the bin size. s = (1<= data_size ); for( int bin = min_bin; bin < NUM_BINS; ++bin ) { block_header* b = fetch_block_from_bin(bin); if( b ) { fprintf( stderr, "found cache in bin %d\r", bin ); assert( b->page_size() == PAGE_SIZE ); block_header* tail = b->split_after( s ); assert( b->page_size() == PAGE_SIZE ); if( tail ) assert( tail->page_size() == PAGE_SIZE ); assert( b->size() >= s ); if( tail && !store_cache( tail ) ) { fprintf( stderr, "unable to cache tail, free it\n" ); this->free( tail->data() ); } assert( b->size() >= s ); return b->data(); } } block_header* new_page = allocate_block_page(); //printf( " alloc new block page %p _size %d _prev_size %d next %p prev %p\n", // new_page, new_page->_size, new_page->_prev_size, new_page->next(), new_page->prev() ); block_header* tail = new_page->split_after(s); // printf( " alloc free tail %p _size %d _prev_size %d next %p prev %p tail %p\n", // tail, tail->_size, tail->_prev_size, tail->next(), tail->prev(), tail ); if( tail && !store_cache( tail ) ) { this->free( tail->data() ); } assert( new_page->size() >= s-8 ); return new_page->data(); } /** * Checks our local bin first, then checks the global bin. * * @return null if no block found in cache. 
*/
block_header* thread_allocator::fetch_block_from_bin( int bin )
{
   // fprintf( stderr, "fetch cache %d has %d items remaining\n", bin, int(_bin_cache_size[bin]) );
   auto lo = fetch_cache(bin);
   if( lo ) return lo;
   assert( _bin_cache_size[bin] == 0 );

   garbage_collector& gc = garbage_collector::get();
   garbage_collector::recycle_bin& rb = gc.get_bin( bin );
   if( auto avail = rb.available() )
   {
      // claim up to half of the available, just in case 2
      // threads try to claim at once, they both can, but
      // don't hold a cache of more than 4 items
      auto claim_num = 2;//std::min( avail/2, 1 );
      // claim_num could now be 0 to 3
      //claim_num++; // claim at least 1 and at most 4

      // this is our one and only atomic 'sync' operation...
      auto claim_pos = rb.claim( claim_num );
      auto claim_end = claim_pos + claim_num;
      bool found = false;
      while( claim_pos != claim_end )
      {
         block_header* h = rb.get_block(claim_pos);
         if( h )
         {
            found = true;
            rb.clear_block(claim_pos); // let gc know we took it.
            ++claim_pos;
            if( claim_pos == claim_end )
            {
               return h;
            }
            else if( !store_cache( h ) )
            {
               assert( !"unable to cache something we asked for!" );
            }
         }
         else // oops... I guess 3 tried to claim at once...
         {
            ++claim_pos;
            // drop it on the floor and let the
            // gc thread pick it up next time through the
            // ring buffer.
         }
      }
      if( found )
      {
         fprintf( stderr, "apparently we overdrew the queue...\n" );
         return fetch_cache(bin); // grab it from the cache this time.
      }
   }
   return nullptr;
}

char* malloc2( int s )  { return thread_allocator::get().alloc(s); }
void  free2( char* s )  { return thread_allocator::get().free(s);  }

#include "bench.cpp"

================================================
FILE: mmap_alloc.hpp
================================================
#pragma once
#include <cstddef>
#include <new>          // std::bad_alloc

extern "C" {
#include <sys/mman.h>
#include <sys/types.h>
#include <fcntl.h>
#include <unistd.h>
#include <math.h>
#include <assert.h>
}

size_t pagesize()
{
   return ::getpagesize();
}

size_t page_count( size_t s )
{
   return static_cast< size_t >( ceilf( static_cast< float >( s) / pagesize() ) );
}

char* mmap_alloc( size_t s, void* loc = 0 )
{
   //fprintf( stderr, "mmap_alloc %llu %p\n", s, loc );
   const std::size_t pages( page_count(s) ); // add +1 for guard page
   std::size_t size_ = pages * pagesize();
# if defined(macintosh) || defined(__APPLE__) || defined(__APPLE_CC__)
   void* limit = ::mmap( loc, size_, PROT_READ | PROT_WRITE,
                         MAP_FIXED | MAP_PRIVATE | MAP_ANON, -1, 0);
# else
   const int fd( ::open("/dev/zero", O_RDONLY) );
   assert( -1 != fd);
   void* limit = ::mmap( loc, size_, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE, fd, 0);
   ::close( fd );   // the mapping keeps the pages alive; the descriptor is no longer needed
# endif
   if( limit == MAP_FAILED ) throw std::bad_alloc();   // mmap signals failure with MAP_FAILED, not null
   return static_cast<char*>(limit);
}

void mmap_free( void* pos, size_t s )
{
   const std::size_t pages( page_count( s) ); // add +1 for guard page
   std::size_t size_ = pages * pagesize();
   ::munmap( pos, size_);
}
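// --- Hedged usage sketch (not part of the repository): mmap_alloc() returns at least `s` bytes
// rounded up to whole pages, and mmap_free() must be passed the same size so the same number of
// pages is unmapped.  Illustrative only:
#include <cstring>

inline void mmap_alloc_demo()
{
    const size_t want = 100*1000;    // internally rounded up to page_count(want) pages
    char* p = mmap_alloc( want );
    std::memset( p, 0, want );       // the mapping is readable and writable
    mmap_free( p, want );            // size must match the original request
}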