Repository: goossaert/kingdb Branch: master Commit: 58994280e789 Files: 75 Total size: 605.4 KB Directory structure: gitextract_ytklrwk_/ ├── .gitignore ├── 3rdparty/ │ ├── leveldb/ │ │ ├── AUTHORS │ │ └── LICENSE │ ├── lz4/ │ │ └── LICENSE │ ├── murmurhash3/ │ │ └── LICENSE │ ├── xxhash/ │ │ └── LICENSE │ └── zlib/ │ └── LICENSE ├── LICENSE ├── Makefile ├── README.md ├── algorithm/ │ ├── coding.cc │ ├── coding.h │ ├── compressor.cc │ ├── compressor.h │ ├── crc32c.cc │ ├── crc32c.h │ ├── endian.cc │ ├── endian.h │ ├── hash.cc │ ├── hash.h │ ├── lz4.cc │ ├── lz4.h │ ├── murmurhash3.cc │ ├── murmurhash3.h │ ├── xxhash.cc │ └── xxhash.h ├── cache/ │ ├── rate_limiter.h │ ├── write_buffer.cc │ └── write_buffer.h ├── doc/ │ ├── bench/ │ │ ├── benchmarks.md │ │ ├── db_bench_kingdb.cc │ │ └── generate_benchmarks_table.py │ ├── kingdb.md │ └── kingserver.md ├── include/ │ └── kingdb/ │ └── kdb.h ├── interface/ │ ├── database.cc │ ├── database.h │ ├── iterator.h │ ├── kingdb.h │ ├── multipart.h │ └── snapshot.h ├── network/ │ ├── client.h │ ├── client_main.cc │ ├── server.cc │ ├── server.h │ └── server_main.cc ├── storage/ │ ├── format.h │ ├── hstable_manager.h │ ├── resource_manager.h │ └── storage_engine.h ├── thread/ │ ├── event_manager.h │ ├── threadpool.h │ └── threadstorage.h ├── unit-tests/ │ ├── client_embedded.cc │ ├── dummy_interface.h │ ├── dummy_storage_engine.h │ ├── dummy_storage_engine_index.h │ ├── kingdb_user.cc │ ├── test_compression.cc │ ├── test_db.cc │ ├── testharness.cc │ └── testharness.h └── util/ ├── byte_array.h ├── config_parser.h ├── debug.cc ├── debug.h ├── file.h ├── filepool.h ├── logger.cc ├── logger.h ├── options.h ├── order.h ├── status.cc ├── status.h └── version.h ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Compiled Object files *.slo *.lo *.o *.obj # 
Compiled Dynamic libraries *.so *.dylib *.dll # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app # Python *.pyc # Vim *~ *.*.swp ================================================ FILE: 3rdparty/leveldb/AUTHORS ================================================ # Names should be added to this file like so: # Name or Organization Google Inc. # Initial version authors: Jeffrey Dean Sanjay Ghemawat # Partial list of contributors: Kevin Regan Johan Bilien ================================================ FILE: 3rdparty/leveldb/LICENSE ================================================ Copyright (c) 2011 The LevelDB Authors. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Google Inc. nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ================================================ FILE: 3rdparty/lz4/LICENSE ================================================ LZ4 Library Copyright (c) 2011-2014, Yann Collet All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
================================================ FILE: 3rdparty/murmurhash3/LICENSE ================================================ MurmurHash3 was written by Austin Appleby, and is placed in the public domain. The author hereby disclaims copyright to this source code. ================================================ FILE: 3rdparty/xxhash/LICENSE ================================================ xxHash Library Copyright (c) 2012-2014, Yann Collet All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
================================================ FILE: 3rdparty/zlib/LICENSE ================================================ /* zlib.h -- interface of the 'zlib' general purpose compression library version 1.2.8, April 28th, 2013 Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software. Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 3. This notice may not be removed or altered from any source distribution. Jean-loup Gailly Mark Adler jloup@gzip.org madler@alumni.caltech.edu */ ================================================ FILE: LICENSE ================================================ Copyright (c) 2014, Emmanuel Goossaert All rights reserved. Use of KingDB is governed by the BSD 3-Clause License. KingDB uses/derives code from third-party open source projects. For the complete list of those projects and their respective licenses, refer to the end of the LICENSE file. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
* Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of KingDB nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ---------------------------------------------------------------- Third-party source code Below is the list of the projects from which KingDB derives code, and the location of their respective LICENSE files. - LevelDB, under BSD 3-Clause License: /3rdparty/leveldb/ - LZ4, under BSD 2-Clause License: /3rdparty/lz4/ - Murmurhash3, public domain: /3rdparty/murmurhash3/ - xxHash, under BSD 2-Clause License: /3rdparty/xxhash/ - zlib, under zlib/libpng License: /3rdparty/zlib/ ================================================ FILE: Makefile ================================================ CC=g++ INCLUDES=-I/usr/local/include/ -I/opt/local/include/ -I. 
-I./include/ LDFLAGS=-g -L/usr/local/lib/ -L/opt/local/lib/ -lpthread LDFLAGS_CLIENT=-g -L/usr/local/lib/ -L/opt/local/lib/ -lpthread -lmemcached -fPIC SOURCES=interface/database.cc util/logger.cc util/status.cc util/debug.cc network/server.cc cache/write_buffer.cc algorithm/endian.cc algorithm/compressor.cc algorithm/murmurhash3.cc algorithm/xxhash.cc algorithm/crc32c.cc algorithm/lz4.cc algorithm/hash.cc algorithm/coding.cc unit-tests/testharness.cc SOURCES_MAIN=network/server_main.cc SOURCES_CLIENT=network/client_main.cc SOURCES_CLIENT_EMB=unit-tests/client_embedded.cc SOURCES_TEST_COMPRESSION=unit-tests/test_compression.cc SOURCES_TEST_DB=unit-tests/test_db.cc OBJECTS=$(SOURCES:.cc=.o) OBJECTS_MAIN=$(SOURCES_MAIN:.cc=.o) OBJECTS_CLIENT=$(SOURCES_CLIENT:.cc=.o) OBJECTS_CLIENT_EMB=$(SOURCES_CLIENT_EMB:.cc=.o) OBJECTS_TEST_COMPRESSION=$(SOURCES_TEST_COMPRESSION:.cc=.o) OBJECTS_TEST_DB=$(SOURCES_TEST_DB:.cc=.o) EXECUTABLE=kingserver CLIENT_NETWORK=client_network CLIENT_EMB=client_emb TEST_COMPRESSION=test_compression TEST_DB=test_db LIBRARY=libkingdb.a PREFIX=/usr/local BINDIR=$(PREFIX)/bin INCLUDEDIR=$(PREFIX)/include/kingdb LIBDIR=$(PREFIX)/lib GCC_WALL=-Waddress -Warray-bounds=1 -Wc++11-compat -Wc++14-compat -Wchar-subscripts -Wenum-compare -Wimplicit-int -Wimplicit-function-declaration -Wcomment -Wformat -Wmain -Wmaybe-uninitialized -Wmissing-braces -Wnonnull -Wopenmp-simd -Wparentheses -Wpointer-sign -Wreorder -Wreturn-type -Wsequence-point -Wsign-compare -Wstrict-aliasing -Wstrict-overflow=1 -Wswitch -Wtrigraphs -Wuninitialized -Wunknown-pragmas -Wunused-function -Wunused-label -Wunused-value -Wunused-variable -Wvolatile-register-var CFLAGS=-Wall -std=c++11 -MMD -MP -c all: CFLAGS += -O2 all: $(SOURCES) $(LIBRARY) $(EXECUTABLE) $(CLIENT_EMB) $(TEST_COMPRESSION) $(TEST_DB) debug: CFLAGS += -DDEBUG -g debug: LDFLAGS+= -lprofiler debug: $(SOURCES) $(LIBRARY) $(EXECUTABLE) $(CLIENT_EMB) $(TEST_COMPRESSION) $(TEST_DB) client: CFLAGS += -O2 client: $(SOURCES) 
$(CLIENT_NETWORK) client-debug: CFLAGS += -DDEBUG -g client-debug: LDFLAGS_CLIENT += -lprofiler client-debug: $(SOURCES) $(CLIENT_NETWORK) threadsanitize: CFLAGS += -DDEBUG -g -fsanitize=thread -O2 -pie -fPIC threadsanitize: LDFLAGS += -pie -ltsan threadsanitize: LDFLAGS_CLIENT += -pie -ltsan threadsanitize: $(SOURCES) $(LIBRARY) $(EXECUTABLE) $(CLIENT_EMB) $(CLIENT_NETWORK) $(TEST_COMPRESSION) $(TEST_DB) $(EXECUTABLE): $(OBJECTS) $(OBJECTS_MAIN) $(CC) $(OBJECTS) $(OBJECTS_MAIN) -o $@ $(LDFLAGS) $(CLIENT_NETWORK): $(OBJECTS) $(OBJECTS_CLIENT) $(CC) $(OBJECTS) $(OBJECTS_CLIENT) -o $@ $(LDFLAGS_CLIENT) $(CLIENT_EMB): $(OBJECTS) $(OBJECTS_CLIENT_EMB) $(CC) $(OBJECTS) $(OBJECTS_CLIENT_EMB) -o $@ $(LDFLAGS) $(TEST_COMPRESSION): $(OBJECTS) $(OBJECTS_TEST_COMPRESSION) $(CC) $(OBJECTS) $(OBJECTS_TEST_COMPRESSION) -o $@ $(LDFLAGS) $(TEST_DB): $(OBJECTS) $(OBJECTS_TEST_DB) $(CC) $(OBJECTS) $(OBJECTS_TEST_DB) -o $@ $(LDFLAGS) $(LIBRARY): $(OBJECTS) rm -f $@ ar -rs $@ $(OBJECTS) install: $(EXECUTABLE) install $(EXECUTABLE) $(BINDIR) cp libkingdb.a $(LIBDIR) rm -rf $(INCLUDEDIR) mkdir $(INCLUDEDIR) $(INCLUDEDIR)/util $(INCLUDEDIR)/thread $(INCLUDEDIR)/interface $(INCLUDEDIR)/storage $(INCLUDEDIR)/algorithm $(INCLUDEDIR)/cache cp algorithm/*.h $(INCLUDEDIR)/algorithm cp cache/*.h $(INCLUDEDIR)/cache cp include/kingdb/*.h $(INCLUDEDIR) cp interface/*.h $(INCLUDEDIR)/interface cp util/*.h $(INCLUDEDIR)/util cp thread/*.h $(INCLUDEDIR)/thread cp storage/*.h $(INCLUDEDIR)/storage .cc.o: $(CC) $(CFLAGS) $(INCLUDES) $< -o $@ clean: rm -f $(EXECUTABLE) $(CLIENT_NETWORK) $(CLIENT_EMB) $(TEST_COMPRESSION) $(TEST_DB) $(LIBRARY) find . -name \.*.*.swp* -type f -print0 | xargs -0 rm -f find . -name \*.d -type f -print0 | xargs -0 rm -f find . -name \*.o -type f -print0 | xargs -0 rm -f find . -name \*~ -type f -print0 | xargs -0 rm -f find . 
-name \*-e -type f -print0 | xargs -0 rm -f -include $(SOURCES:%.cc=%.d) ================================================ FILE: README.md ================================================ KingDB ====== ###What is KingDB? **KingDB** is a fast on-disk persistent key-value store. You can embed it or use it as a library in your C++ applications. **KingServer** is a server application that embeds KingDB and implements the Memcached protocol. It allows you to access your data through a network interface using whatever programming language you want. KingServer is not a distributed system: the data lives in a single machine. **WARNING:** This is still alpha code. Even though unit-tests are covering the core components, make sure you run tests for your environment before using KingDB in production. ###Why use KingDB? - Fast for heavy write workloads and random reads. - The architecture, code, and data format are simple. - Multipart API to read and write large entries in smaller parts. - Multiple threads can access the same database safely. - Crash-proof: nothing ever gets overwritten. - Iterators and read-only consistent snapshots. - Compaction happens in a background thread, and does not block reads or writes. - The data format allows hot backups to be made. - Covered by unit tests. ###How fast is KingDB? KingDB was benchmarked using the same test suite as LevelDB. On a Linux CentOS 6.5, for entries with 16-byte keys and 100-byte values (50 bytes after compression), the performance was: | Workload | Operations per second | | ------------------: | :-------------------- | | Sequential reads | 104k | | Random reads | 203k | | Sequential writes | 233k | | Random writes | 251k | | Overwrite | 250k | For more details and a comparison with LevelDB, you can refer to the full [KingDB benchmarks](doc/bench/benchmarks.md). ###Where is the documentation? You can learn more in the [KingDB documentation](doc/kingdb.md) and the [KingServer documentation](doc/kingserver.md). 
###How to install KingDB? You can find installation instructions in the [installation section](doc/kingdb.md#2-how-to-install-kingdb) of the documentation. KingDB has no external dependencies and has been tested on: - Mac OS X 10.9.5 with Apple LLVM version 6.0 (clang-600.0.51) - Linux Ubuntu 14.04 x64 with GCC 4.9.2 - Linux Ubuntu 15.04 x64 with GCC 4.9.2-10ubuntu13 - Linux CentOS 6.5 x86\_64 with GCC 4.9.2 If you are using GCC, update the Makefile and add \-fno\-builtin\-memcmp in the CFLAGS, and if you have tcmalloc on your system, add \-ltcmalloc to the LDFLAGS. This will give you a nice performance speed\-up. ###Where to get help? You can get help on the [KingDB mailing list](https://groups.google.com/d/forum/kingdb). To ask a question, simply [join the list](https://groups.google.com/d/forum/kingdb/join). ================================================ FILE: algorithm/coding.cc ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. // The code below was copied from LevelDB. A few changes were applied to make it // self-sufficient and part of KingDB. // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#include "algorithm/coding.h"

namespace kdb {

// Writes 'value' into buf[0..3], least-significant byte first.
// REQUIRES: buf has room for 4 bytes.
void EncodeFixed32(char* buf, uint32_t value) {
  if (kdb::kLittleEndian) {
    // Host byte order already matches the on-disk format: plain copy.
    memcpy(buf, &value, sizeof(value));
  } else {
    buf[0] = value & 0xff;
    buf[1] = (value >> 8) & 0xff;
    buf[2] = (value >> 16) & 0xff;
    buf[3] = (value >> 24) & 0xff;
  }
}

// Writes 'value' into buf[0..7], least-significant byte first.
// REQUIRES: buf has room for 8 bytes.
void EncodeFixed64(char* buf, uint64_t value) {
  if (kdb::kLittleEndian) {
    memcpy(buf, &value, sizeof(value));
  } else {
    buf[0] = value & 0xff;
    buf[1] = (value >> 8) & 0xff;
    buf[2] = (value >> 16) & 0xff;
    buf[3] = (value >> 24) & 0xff;
    buf[4] = (value >> 32) & 0xff;
    buf[5] = (value >> 40) & 0xff;
    buf[6] = (value >> 48) & 0xff;
    buf[7] = (value >> 56) & 0xff;
  }
}

// Reads a little-endian fixed32 from 'buf' into '*value'.
void GetFixed32(const char* buf, uint32_t* value) {
  if (kdb::kLittleEndian) {
    memcpy(value, buf, sizeof(*value));
  } else {
    // NOTE: this code has not been tested yet
    // Cast through unsigned char first: casting a (possibly signed) char
    // directly to uint32_t sign-extends bytes >= 0x80 and corrupts the result.
    *value = (uint32_t)(unsigned char)buf[0] |
             (uint32_t)(unsigned char)buf[1] << 8 |
             (uint32_t)(unsigned char)buf[2] << 16 |
             (uint32_t)(unsigned char)buf[3] << 24;
  }
}

// Reads a little-endian fixed64 from 'buf' into '*value'.
void GetFixed64(const char* buf, uint64_t* value) {
  if (kdb::kLittleEndian) {
    memcpy(value, buf, sizeof(*value));
  } else {
    // NOTE: this code has not been tested yet
    // Cast through unsigned char to avoid sign extension (see GetFixed32).
    *value = (uint64_t)(unsigned char)buf[0] |
             (uint64_t)(unsigned char)buf[1] << 8 |
             (uint64_t)(unsigned char)buf[2] << 16 |
             (uint64_t)(unsigned char)buf[3] << 24 |
             (uint64_t)(unsigned char)buf[4] << 32 |
             (uint64_t)(unsigned char)buf[5] << 40 |
             (uint64_t)(unsigned char)buf[6] << 48 |
             (uint64_t)(unsigned char)buf[7] << 56;
  }
}

// Appends a little-endian fixed32 encoding of 'value' to '*dst'.
void PutFixed32(std::string* dst, uint32_t value) {
  char buf[sizeof(value)];
  EncodeFixed32(buf, value);
  dst->append(buf, sizeof(buf));
}

// Appends a little-endian fixed64 encoding of 'value' to '*dst'.
void PutFixed64(std::string* dst, uint64_t value) {
  char buf[sizeof(value)];
  EncodeFixed64(buf, value);
  dst->append(buf, sizeof(buf));
}

// Encodes 'v' as a varint32 starting at 'dst'; returns a pointer just past
// the last byte written. REQUIRES: dst has room for up to 5 bytes.
char* EncodeVarint32(char* dst, uint32_t v) {
  // Operate on characters as unsigneds
  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
  static const int B = 128;  // continuation bit
  if (v < (1<<7)) {
    *(ptr++) = v;
  } else if (v < (1<<14)) {
    *(ptr++) = v | B;
    *(ptr++) = v>>7;
  } else if (v < (1<<21)) {
    *(ptr++) = v | B;
    *(ptr++) = (v>>7) | B;
    *(ptr++) = v>>14;
  } else if (v < (1<<28)) {
    *(ptr++) = v | B;
    *(ptr++) = (v>>7) | B;
    *(ptr++) = (v>>14) | B;
    *(ptr++) = v>>21;
  } else {
    *(ptr++) = v | B;
    *(ptr++) = (v>>7) | B;
    *(ptr++) = (v>>14) | B;
    *(ptr++) = (v>>21) | B;
    *(ptr++) = v>>28;
  }
  return reinterpret_cast<char*>(ptr);
}

// Appends the varint32 encoding of 'v' to '*dst'.
void PutVarint32(std::string* dst, uint32_t v) {
  char buf[5];
  char* ptr = EncodeVarint32(buf, v);
  dst->append(buf, ptr - buf);
}

// Encodes 'v' as a varint64 starting at 'dst'; returns a pointer just past
// the last byte written. REQUIRES: dst has room for up to 10 bytes.
char* EncodeVarint64(char* dst, uint64_t v) {
  static const int B = 128;  // continuation bit
  unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
  while (v >= B) {
    *(ptr++) = (v & (B-1)) | B;
    v >>= 7;
  }
  *(ptr++) = static_cast<unsigned char>(v);
  return reinterpret_cast<char*>(ptr);
}

// Appends the varint64 encoding of 'v' to '*dst'.
void PutVarint64(std::string* dst, uint64_t v) {
  char buf[10];
  char* ptr = EncodeVarint64(buf, v);
  dst->append(buf, ptr - buf);
}

// Returns the number of bytes the varint encoding of 'v' occupies (1..10).
int VarintLength(uint64_t v) {
  int len = 1;
  while (v >= 128) {
    v >>= 7;
    len++;
  }
  return len;
}

// Slow path of GetVarint32Ptr: decodes a multi-byte varint32 from [p, limit).
// Returns a pointer just past the parsed value, or NULL on error/truncation.
const char* GetVarint32PtrFallback(const char* p,
                                   const char* limit,
                                   uint32_t* value) {
  uint32_t result = 0;
  for (uint32_t shift = 0; shift <= 28 && p < limit; shift += 7) {
    uint32_t byte = *(reinterpret_cast<const unsigned char*>(p));
    p++;
    if (byte & 128) {
      // More bytes are present
      result |= ((byte & 127) << shift);
    } else {
      result |= (byte << shift);
      *value = result;
      return reinterpret_cast<const char*>(p);
    }
  }
  return NULL;
}

// Decodes a varint32 from the first 'size' bytes of 'data' into '*value'.
// Returns the number of bytes consumed, or -1 on error.
int GetVarint32(char* data, uint64_t size, uint32_t* value) {
  const char* p = data;
  const char* limit = p + size;
  const char* q = GetVarint32Ptr(p, limit, value);
  if (q == NULL) {
    return -1;
  } else {
    return q - p;
  }
}

// Decodes a varint64 from [p, limit) into '*value'. Returns a pointer just
// past the parsed value, or NULL on error/truncation.
const char* GetVarint64Ptr(const char* p, const char* limit, uint64_t* value) {
  uint64_t result = 0;
  for (uint32_t shift = 0; shift <= 63 && p < limit; shift += 7) {
    uint64_t byte = *(reinterpret_cast<const unsigned char*>(p));
    p++;
    if (byte & 128) {
      // More bytes are present
      result |= ((byte & 127) << shift);
    } else {
      result |= (byte << shift);
      *value = result;
      return reinterpret_cast<const char*>(p);
    }
  }
  return NULL;
}

// Decodes a varint64 from the first 'size' bytes of 'data' into '*value'.
// Returns the number of bytes consumed, or -1 on error.
int GetVarint64(char* data, uint64_t size, uint64_t* value) {
  const char* p = data;
  const char* limit = p + size;
  const char* q = GetVarint64Ptr(p, limit, value);
  if (q == NULL) {
    return -1;
  } else {
    return q - p;
  }
}

}  // namespace kdb
================================================ FILE: algorithm/coding.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. // The code below was copied from LevelDB. A few changes were applied to make it // self-sufficient and part of KingDB. // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. // Endian-neutral encoding: // * Fixed-length numbers are encoded with least-significant byte first // * In addition we support variable length "varint" encoding // * Strings are encoded prefixed by their length in varint format #ifndef KINGDB_CODING_H_ #define KINGDB_CODING_H_ #include "util/debug.h" #include #include #include #include "algorithm/endian.h" #include "util/status.h" namespace kdb { // Standard Put... routines append to a string extern void PutFixed32(std::string* dst, uint32_t value); extern void PutFixed64(std::string* dst, uint64_t value); extern void PutVarint32(std::string* dst, uint32_t value); extern void PutVarint64(std::string* dst, uint64_t value); extern int GetVarint32(char* input, uint64_t size, uint32_t* value); extern int GetVarint64(char* input, uint64_t size, uint64_t* value); // Pointer-based variants of GetVarint... These either store a value // in *v and return a pointer just past the parsed value, or return // NULL on error. These routines only look at bytes in the range // [p..limit-1] extern const char* GetVarint32Ptr(const char* p,const char* limit, uint32_t* v); extern const char* GetVarint64Ptr(const char* p,const char* limit, uint64_t* v); // Returns the length of the varint32 or varint64 encoding of "v" extern int VarintLength(uint64_t v); // Lower-level versions of Put... 
that write directly into a character buffer // REQUIRES: dst has enough space for the value being written extern void EncodeFixed32(char* dst, uint32_t value); extern void EncodeFixed64(char* dst, uint64_t value); extern void GetFixed32(const char* src, uint32_t* value); extern void GetFixed64(const char* src, uint64_t* value); // Lower-level versions of Put... that write directly into a character buffer // and return a pointer just past the last byte written. // REQUIRES: dst has enough space for the value being written extern char* EncodeVarint32(char* dst, uint32_t value); extern char* EncodeVarint64(char* dst, uint64_t value); // Lower-level versions of Get... that read directly from a character buffer // without any bounds checking. inline uint32_t DecodeFixed32(const char* ptr) { if (kdb::kLittleEndian) { // Load the raw bytes uint32_t result; memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load return result; } else { return ((static_cast(static_cast(ptr[0]))) | (static_cast(static_cast(ptr[1])) << 8) | (static_cast(static_cast(ptr[2])) << 16) | (static_cast(static_cast(ptr[3])) << 24)); } } inline uint64_t DecodeFixed64(const char* ptr) { if (kdb::kLittleEndian) { // Load the raw bytes uint64_t result; memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load return result; } else { uint64_t lo = DecodeFixed32(ptr); uint64_t hi = DecodeFixed32(ptr + 4); return (hi << 32) | lo; } } // Internal routine for use by fallback path of GetVarint32Ptr extern const char* GetVarint32PtrFallback(const char* p, const char* limit, uint32_t* value); inline const char* GetVarint32Ptr(const char* p, const char* limit, uint32_t* value) { if (p < limit) { uint32_t result = *(reinterpret_cast(p)); if ((result & 128) == 0) { *value = result; return p + 1; } } return GetVarint32PtrFallback(p, limit, value); } } // namespace kdb #endif // KINGDB_CODING_H_ ================================================ FILE: algorithm/compressor.cc 
================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. #include "algorithm/compressor.h" namespace kdb { void CompressorLZ4::ResetThreadLocalStorage() { ts_compress_.reset(); ts_uncompress_.reset(); } Status CompressorLZ4::Compress(char *source, uint64_t size_source, char **dest, uint64_t *size_dest) { /* if (size_source < 8) { *dest = nullptr; *size_dest = 0; return Status::OK(); } */ uint32_t bound = LZ4_compressBound(size_source); *size_dest = 0; *dest = new char[8 + bound]; int ret = LZ4_compress_limitedOutput(source, (*dest) + 8, size_source, bound); if (ret <= 0) { delete[] *dest; return Status::IOError("LZ4_compress_limitedOutput() failed"); } uint32_t size_compressed = ret + 8; uint32_t size_compressed_stored = size_compressed; // If the frame was compressed to a size larger than the original data, // we just copy the original data. if ((uint64_t)ret > size_source) { if (size_source > 8 + bound) { delete[] *dest; *dest = new char[8 + size_source]; } memcpy((*dest) + 8, source, size_source); size_compressed = size_source + 8; size_compressed_stored = 0; } // NOTE: Yes, storing 64 bits into 32, but overflows will not happens as // size_source is limited to db.storage.maximum_part_size uint32_t size_source_32 = size_source; EncodeFixed32((*dest), size_compressed_stored); EncodeFixed32((*dest) + 4, size_source_32); // NOTE: small entries don't need to have the compressed and source sizes // in front of the frame, this is just a waste of storage space. // Maybe have a special type of entry, like 'small' or 'self-contained', // which would indicate that the frame doesn't have the sizes. 
log::trace("CompressorLZ4::Compress()", "size_compressed:%u size_source:%u", size_compressed, size_source_32); uint64_t size_compressed_total = ts_compress_.get() + size_compressed; ts_compress_.put(size_compressed_total); *size_dest = size_compressed; return Status::OK(); } bool CompressorLZ4::IsUncompressionDone(uint64_t size_source_total) { uint64_t offset_uncompress = ts_uncompress_.get(); if (offset_uncompress == size_source_total) return true; return false; } Status CompressorLZ4::Uncompress(char *source, uint64_t size_source_total, char **dest, uint64_t *size_dest, char **frame_out, uint64_t *size_frame_out, bool do_memory_allocation // = true by dafault ) { uint64_t offset_uncompress = ts_uncompress_.get(); log::trace("CompressorLZ4::Uncompress()", "in %" PRIu64 " %" PRIu64, offset_uncompress, size_source_total); if (do_memory_allocation) { *dest = nullptr; } if (offset_uncompress == size_source_total) return Status::Done(); uint32_t size_source, size_compressed; GetFixed32(source + offset_uncompress, &size_compressed); GetFixed32(source + offset_uncompress + 4, &size_source); if (size_compressed > 0) { // the frame contains compressed data size_compressed -= 8; *size_dest = 0; if (do_memory_allocation) { *dest = new char[size_source]; } int size = size_compressed; log::trace("CompressorLZ4::Uncompress()", "ptr:%p size:%d size_source:%d offset:%" PRIu64, source + offset_uncompress + 8, size, size_source, offset_uncompress); int ret = LZ4_decompress_safe_partial(source + offset_uncompress + 8, *dest, size, size_source, size_source); if (ret <= 0) { if (do_memory_allocation) { delete[] (*dest); *dest = nullptr; } return Status::IOError("LZ4_decompress_safe_partial() failed"); } *size_dest = ret; } else { // the frame contains uncompressed data size_compressed = size_source; *size_dest = size_source; if (do_memory_allocation) { *dest = new char[size_source]; } memcpy(*dest, source + offset_uncompress + 8, size_source); } crc32_.stream(source + 
offset_uncompress, size_compressed + 8); *frame_out = source + offset_uncompress; *size_frame_out = size_compressed + 8; log::trace("CompressorLZ4::Uncompress()", "crc32:0x%" PRIx64 " frame_ptr:%p frame_size:%" PRIu64, crc32_.get(), *frame_out, *size_frame_out); offset_uncompress += size_compressed + 8; ts_uncompress_.put(offset_uncompress); log::trace("CompressorLZ4::Uncompress()", "out %" PRIu64 " %" PRIu64, offset_uncompress, size_source_total); return Status::OK(); } Status CompressorLZ4::UncompressByteArray(ByteArray& value, bool do_checksum_verification, ByteArray* value_uncompressed) { Status s; if (do_checksum_verification) { crc32_.ResetThreadLocalStorage(); crc32_.put(value.checksum_initial()); } bool is_compressed = value.is_compressed(); bool is_compression_disabled = false; uint64_t offset_in = 0; uint64_t offset_out = 0; ResetThreadLocalStorage(); *value_uncompressed = ByteArray::NewAllocatedMemoryByteArray(value.size()); value_uncompressed->set_size(value.size()); value_uncompressed->set_size_compressed(0); while (true) { if (is_compressed && !is_compression_disabled) { if (IsUncompressionDone(value.size_compressed())) { if ( !do_checksum_verification || crc32_.get() == value.checksum()) { log::debug("CompressorLZ4::UncompressByteArray()", "Good CRC32 - stored:0x%08" PRIx64 " computed:0x%08" PRIx64 "\n", value.checksum(), crc32_.get()); return Status::OK(); } else { log::debug("CompressorLZ4::UncompressByteArray()", "Bad CRC32 - stored:0x%08" PRIx64 " computed:0x%08" PRIx64 "\n", value.checksum(), crc32_.get()); return Status::IOError("Invalid checksum."); } } if (HasFrameHeaderDisabledCompression(value.data() + offset_in)) { log::debug("CompressorLZ4::UncompressByteArray()", "Finds that compression is disabled\n"); is_compression_disabled = true; if (do_checksum_verification) { crc32_.stream(value.data() + offset_in, size_frame_header()); } offset_in += size_frame_header(); } if (!is_compression_disabled) { char *frame; uint64_t size_frame; uint64_t 
size_out; char *buffer_out = value_uncompressed->data() + offset_out; log::trace("CompressorLZ4::UncompressByteArray()", "before uncompress"); Status s = Uncompress(value.data(), value.size_compressed(), &buffer_out, &size_out, &frame, &size_frame, false); //chunk_ = NewShallowCopyByteArray(data_out, size_out); if (s.IsDone()) { return Status::OK(); } else if (s.IsOK()) { if (do_checksum_verification) { crc32_.stream(frame, size_frame); } } else { return s; } offset_in += size_frame; offset_out += size_out; } } if (!is_compressed || is_compression_disabled) { log::trace("CompressorLZ4::UncompressByteArray()", "No compression or compression disabled"); uint64_t size_left; if (is_compressed && is_compression_disabled) { size_left = value.size_compressed(); } else { size_left = value.size(); } if (offset_in == size_left) { log::trace("CompressorLZ4::UncompressByteArray()", "Has gotten all the data"); return Status::OK(); } char* data_left = value.data() + offset_in; size_t step = 1024*1024; size_t size_current = offset_in + step < size_left ? step : size_left - offset_in; if (do_checksum_verification) { crc32_.stream(data_left, size_current); } memcpy(value_uncompressed->data() + offset_out, data_left, size_current); //chunk_ = value; //chunk_.increment_offset(offset); //chunk_.set_size(size_current); //chunk_.set_size_compressed(0); offset_in += size_current; offset_out += size_current; log::trace("CompressorLZ4::UncompressByteArray()", "Done with handling uncompressed data"); return Status::OK(); } } return true; } }; ================================================ FILE: algorithm/compressor.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. 
#ifndef KINGDB_COMPRESSOR_H_ #define KINGDB_COMPRESSOR_H_ #include "util/debug.h" #include #include #include #include "algorithm/lz4.h" #include "util/logger.h" #include "util/status.h" #include "util/byte_array.h" #include "thread/threadstorage.h" #include "algorithm/crc32c.h" namespace kdb { /* Depending on whether or not compression is enabled, and if compression is * even possible (i.e. the data is not incompressible), the data stored for an * entry will be stored with the appropriate format. * * 1. Compression is disabled * - The space reserved on secondary storage for the entry's value is of the * exact size of the value itself, and the data of the value is stored * contiguously in that space. * - The entry is marked as being 'compacted', because it uses exactly the space * that was allocated for it. * - The checksum calculation is done over the raw data, i.e. the data of the * value itself. * * 2. Compression is enabled * - Data is stored in 'frames'. Each frame contains a chunk of compressed data, * with its own compression dictionary (depending on the compression algorithm * being used). * - The space reserved on secondary storage for the entry's value is of the * size of the raw value data, to which a padding is added. That padding must * be at least as large as the size of a frame header, which is 8 bytes. * - The entry is marked as being 'uncompacted', the space reserved on secondary * storage is likely to be more than the space the compressed value data is * actually using (in the case of incompressible data, the allocated space may * be just enough). 
 * 3. The data or part of the data is incompressible
 * - If the data is incompressible, i.e. the compressed data takes more space
 *   than the raw data, then we have a problem. The solution is to allocate at
 *   least the size of one frame header, which guarantees that in the worst
 *   case, the data can be stored as it is, uncompressed.
* } * => This if-statement is testing, considering that the current chunk * is compressed, if there will be enough space to store the rest * of the entry as raw data when the next chunk comes in, including * a frame header. If not, then the compression needs to be disabled * immediately. Given that the padding is at least as large as the * size of a frame header, and that the test is performed over the * next chunk, any point of the compression stream will have enough * space to store the entry as raw data if it needs to. * - If compression is disabled mid-flight, the frame header is a sequence of * null bytes. That way, the decompression process will be able to identify * where compression was disabled, and copy the raw data directly on-ward. * */ class CompressorLZ4 { public: CompressorLZ4() { } // Added an empty copy assignment operator to avoid error messages of the type: // "object of type '...' cannot be assigned because its copy assignment // operator is implicitly deleted" CompressorLZ4& operator=(const CompressorLZ4& r) { if(&r == this) return *this; return *this; } virtual ~CompressorLZ4() { //log::emerg("CompressorLZ4()::dtor", "call"); } void ResetThreadLocalStorage(); Status Compress(char *raw_in, uint64_t size_raw_in, char **compressed_out, uint64_t *size_compressed_out ); bool IsUncompressionDone(uint64_t size_source); Status Uncompress(char *source, uint64_t size_source, char **dest, uint64_t *size_dest, char **frame_out, uint64_t *size_frame_out, bool do_memory_allocation=true ); Status UncompressByteArray(ByteArray& value, bool do_checksum_verification, ByteArray* value_uncompressed); void DisableCompressionInFrameHeader(char* frame) { for (uint64_t i = 0; i < size_frame_header(); i++) frame[i] = 0; } bool HasFrameHeaderDisabledCompression(char *frame) { for (uint64_t i = 0; i < size_frame_header(); i++) { if(frame[i] != 0) return false; } return true; } uint64_t thread_local_handler(std::map& status, std::mutex& mutex, uint64_t value, bool 
apply); uint64_t size_compressed() { return ts_compress_.get(); } uint64_t MaxInputSize() { return LZ4_MAX_INPUT_SIZE; } uint64_t size_frame_header() { return 8; }; uint64_t size_uncompressed_frame(uint64_t size_data) { return size_data + 8; } void AdjustCompressedSize(int64_t inc) { int64_t size = ts_compress_.get() + inc; ts_compress_.put(size); } private: ThreadStorage ts_compress_; ThreadStorage ts_uncompress_; CRC32 crc32_; }; }; #endif // KINGDB_COMPRESSOR_H_ ================================================ FILE: algorithm/crc32c.cc ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. // The code below was copied from zlib and LevelDB. A few changes were // applied to make it self-sufficient and part of KingDB. // zlib.h -- interface of the 'zlib' general purpose compression library // version 1.2.8, April 28th, 2013 // Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. // // A portable implementation of crc32c, optimized to handle // four bytes at a time. 
#include "algorithm/crc32c.h" #include namespace kdb { namespace crc32c { static const uint32_t table0_[256] = { 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, 0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a, 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, 0xdfeb33c7, 
0x2d80b0c4, 0x3ed04330, 0xccbbc033, 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982, 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, 0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351 }; static const uint32_t table1_[256] = { 0x00000000, 0x13a29877, 0x274530ee, 0x34e7a899, 0x4e8a61dc, 0x5d28f9ab, 0x69cf5132, 0x7a6dc945, 0x9d14c3b8, 0x8eb65bcf, 0xba51f356, 0xa9f36b21, 0xd39ea264, 0xc03c3a13, 0xf4db928a, 0xe7790afd, 0x3fc5f181, 0x2c6769f6, 0x1880c16f, 0x0b225918, 0x714f905d, 0x62ed082a, 0x560aa0b3, 0x45a838c4, 0xa2d13239, 0xb173aa4e, 0x859402d7, 0x96369aa0, 0xec5b53e5, 0xfff9cb92, 0xcb1e630b, 0xd8bcfb7c, 0x7f8be302, 0x6c297b75, 0x58ced3ec, 0x4b6c4b9b, 0x310182de, 0x22a31aa9, 0x1644b230, 0x05e62a47, 0xe29f20ba, 0xf13db8cd, 0xc5da1054, 0xd6788823, 0xac154166, 0xbfb7d911, 0x8b507188, 0x98f2e9ff, 0x404e1283, 0x53ec8af4, 0x670b226d, 0x74a9ba1a, 0x0ec4735f, 0x1d66eb28, 0x298143b1, 0x3a23dbc6, 0xdd5ad13b, 0xcef8494c, 0xfa1fe1d5, 0xe9bd79a2, 0x93d0b0e7, 0x80722890, 0xb4958009, 0xa737187e, 
0xff17c604, 0xecb55e73, 0xd852f6ea, 0xcbf06e9d, 0xb19da7d8, 0xa23f3faf, 0x96d89736, 0x857a0f41, 0x620305bc, 0x71a19dcb, 0x45463552, 0x56e4ad25, 0x2c896460, 0x3f2bfc17, 0x0bcc548e, 0x186eccf9, 0xc0d23785, 0xd370aff2, 0xe797076b, 0xf4359f1c, 0x8e585659, 0x9dface2e, 0xa91d66b7, 0xbabffec0, 0x5dc6f43d, 0x4e646c4a, 0x7a83c4d3, 0x69215ca4, 0x134c95e1, 0x00ee0d96, 0x3409a50f, 0x27ab3d78, 0x809c2506, 0x933ebd71, 0xa7d915e8, 0xb47b8d9f, 0xce1644da, 0xddb4dcad, 0xe9537434, 0xfaf1ec43, 0x1d88e6be, 0x0e2a7ec9, 0x3acdd650, 0x296f4e27, 0x53028762, 0x40a01f15, 0x7447b78c, 0x67e52ffb, 0xbf59d487, 0xacfb4cf0, 0x981ce469, 0x8bbe7c1e, 0xf1d3b55b, 0xe2712d2c, 0xd69685b5, 0xc5341dc2, 0x224d173f, 0x31ef8f48, 0x050827d1, 0x16aabfa6, 0x6cc776e3, 0x7f65ee94, 0x4b82460d, 0x5820de7a, 0xfbc3faf9, 0xe861628e, 0xdc86ca17, 0xcf245260, 0xb5499b25, 0xa6eb0352, 0x920cabcb, 0x81ae33bc, 0x66d73941, 0x7575a136, 0x419209af, 0x523091d8, 0x285d589d, 0x3bffc0ea, 0x0f186873, 0x1cbaf004, 0xc4060b78, 0xd7a4930f, 0xe3433b96, 0xf0e1a3e1, 0x8a8c6aa4, 0x992ef2d3, 0xadc95a4a, 0xbe6bc23d, 0x5912c8c0, 0x4ab050b7, 0x7e57f82e, 0x6df56059, 0x1798a91c, 0x043a316b, 0x30dd99f2, 0x237f0185, 0x844819fb, 0x97ea818c, 0xa30d2915, 0xb0afb162, 0xcac27827, 0xd960e050, 0xed8748c9, 0xfe25d0be, 0x195cda43, 0x0afe4234, 0x3e19eaad, 0x2dbb72da, 0x57d6bb9f, 0x447423e8, 0x70938b71, 0x63311306, 0xbb8de87a, 0xa82f700d, 0x9cc8d894, 0x8f6a40e3, 0xf50789a6, 0xe6a511d1, 0xd242b948, 0xc1e0213f, 0x26992bc2, 0x353bb3b5, 0x01dc1b2c, 0x127e835b, 0x68134a1e, 0x7bb1d269, 0x4f567af0, 0x5cf4e287, 0x04d43cfd, 0x1776a48a, 0x23910c13, 0x30339464, 0x4a5e5d21, 0x59fcc556, 0x6d1b6dcf, 0x7eb9f5b8, 0x99c0ff45, 0x8a626732, 0xbe85cfab, 0xad2757dc, 0xd74a9e99, 0xc4e806ee, 0xf00fae77, 0xe3ad3600, 0x3b11cd7c, 0x28b3550b, 0x1c54fd92, 0x0ff665e5, 0x759baca0, 0x663934d7, 0x52de9c4e, 0x417c0439, 0xa6050ec4, 0xb5a796b3, 0x81403e2a, 0x92e2a65d, 0xe88f6f18, 0xfb2df76f, 0xcfca5ff6, 0xdc68c781, 0x7b5fdfff, 0x68fd4788, 0x5c1aef11, 0x4fb87766, 0x35d5be23, 0x26772654, 
0x12908ecd, 0x013216ba, 0xe64b1c47, 0xf5e98430, 0xc10e2ca9, 0xd2acb4de, 0xa8c17d9b, 0xbb63e5ec, 0x8f844d75, 0x9c26d502, 0x449a2e7e, 0x5738b609, 0x63df1e90, 0x707d86e7, 0x0a104fa2, 0x19b2d7d5, 0x2d557f4c, 0x3ef7e73b, 0xd98eedc6, 0xca2c75b1, 0xfecbdd28, 0xed69455f, 0x97048c1a, 0x84a6146d, 0xb041bcf4, 0xa3e32483 }; static const uint32_t table2_[256] = { 0x00000000, 0xa541927e, 0x4f6f520d, 0xea2ec073, 0x9edea41a, 0x3b9f3664, 0xd1b1f617, 0x74f06469, 0x38513ec5, 0x9d10acbb, 0x773e6cc8, 0xd27ffeb6, 0xa68f9adf, 0x03ce08a1, 0xe9e0c8d2, 0x4ca15aac, 0x70a27d8a, 0xd5e3eff4, 0x3fcd2f87, 0x9a8cbdf9, 0xee7cd990, 0x4b3d4bee, 0xa1138b9d, 0x045219e3, 0x48f3434f, 0xedb2d131, 0x079c1142, 0xa2dd833c, 0xd62de755, 0x736c752b, 0x9942b558, 0x3c032726, 0xe144fb14, 0x4405696a, 0xae2ba919, 0x0b6a3b67, 0x7f9a5f0e, 0xdadbcd70, 0x30f50d03, 0x95b49f7d, 0xd915c5d1, 0x7c5457af, 0x967a97dc, 0x333b05a2, 0x47cb61cb, 0xe28af3b5, 0x08a433c6, 0xade5a1b8, 0x91e6869e, 0x34a714e0, 0xde89d493, 0x7bc846ed, 0x0f382284, 0xaa79b0fa, 0x40577089, 0xe516e2f7, 0xa9b7b85b, 0x0cf62a25, 0xe6d8ea56, 0x43997828, 0x37691c41, 0x92288e3f, 0x78064e4c, 0xdd47dc32, 0xc76580d9, 0x622412a7, 0x880ad2d4, 0x2d4b40aa, 0x59bb24c3, 0xfcfab6bd, 0x16d476ce, 0xb395e4b0, 0xff34be1c, 0x5a752c62, 0xb05bec11, 0x151a7e6f, 0x61ea1a06, 0xc4ab8878, 0x2e85480b, 0x8bc4da75, 0xb7c7fd53, 0x12866f2d, 0xf8a8af5e, 0x5de93d20, 0x29195949, 0x8c58cb37, 0x66760b44, 0xc337993a, 0x8f96c396, 0x2ad751e8, 0xc0f9919b, 0x65b803e5, 0x1148678c, 0xb409f5f2, 0x5e273581, 0xfb66a7ff, 0x26217bcd, 0x8360e9b3, 0x694e29c0, 0xcc0fbbbe, 0xb8ffdfd7, 0x1dbe4da9, 0xf7908dda, 0x52d11fa4, 0x1e704508, 0xbb31d776, 0x511f1705, 0xf45e857b, 0x80aee112, 0x25ef736c, 0xcfc1b31f, 0x6a802161, 0x56830647, 0xf3c29439, 0x19ec544a, 0xbcadc634, 0xc85da25d, 0x6d1c3023, 0x8732f050, 0x2273622e, 0x6ed23882, 0xcb93aafc, 0x21bd6a8f, 0x84fcf8f1, 0xf00c9c98, 0x554d0ee6, 0xbf63ce95, 0x1a225ceb, 0x8b277743, 0x2e66e53d, 0xc448254e, 0x6109b730, 0x15f9d359, 0xb0b84127, 0x5a968154, 0xffd7132a, 0xb3764986, 
0x1637dbf8, 0xfc191b8b, 0x595889f5, 0x2da8ed9c, 0x88e97fe2, 0x62c7bf91, 0xc7862def, 0xfb850ac9, 0x5ec498b7, 0xb4ea58c4, 0x11abcaba, 0x655baed3, 0xc01a3cad, 0x2a34fcde, 0x8f756ea0, 0xc3d4340c, 0x6695a672, 0x8cbb6601, 0x29faf47f, 0x5d0a9016, 0xf84b0268, 0x1265c21b, 0xb7245065, 0x6a638c57, 0xcf221e29, 0x250cde5a, 0x804d4c24, 0xf4bd284d, 0x51fcba33, 0xbbd27a40, 0x1e93e83e, 0x5232b292, 0xf77320ec, 0x1d5de09f, 0xb81c72e1, 0xccec1688, 0x69ad84f6, 0x83834485, 0x26c2d6fb, 0x1ac1f1dd, 0xbf8063a3, 0x55aea3d0, 0xf0ef31ae, 0x841f55c7, 0x215ec7b9, 0xcb7007ca, 0x6e3195b4, 0x2290cf18, 0x87d15d66, 0x6dff9d15, 0xc8be0f6b, 0xbc4e6b02, 0x190ff97c, 0xf321390f, 0x5660ab71, 0x4c42f79a, 0xe90365e4, 0x032da597, 0xa66c37e9, 0xd29c5380, 0x77ddc1fe, 0x9df3018d, 0x38b293f3, 0x7413c95f, 0xd1525b21, 0x3b7c9b52, 0x9e3d092c, 0xeacd6d45, 0x4f8cff3b, 0xa5a23f48, 0x00e3ad36, 0x3ce08a10, 0x99a1186e, 0x738fd81d, 0xd6ce4a63, 0xa23e2e0a, 0x077fbc74, 0xed517c07, 0x4810ee79, 0x04b1b4d5, 0xa1f026ab, 0x4bdee6d8, 0xee9f74a6, 0x9a6f10cf, 0x3f2e82b1, 0xd50042c2, 0x7041d0bc, 0xad060c8e, 0x08479ef0, 0xe2695e83, 0x4728ccfd, 0x33d8a894, 0x96993aea, 0x7cb7fa99, 0xd9f668e7, 0x9557324b, 0x3016a035, 0xda386046, 0x7f79f238, 0x0b899651, 0xaec8042f, 0x44e6c45c, 0xe1a75622, 0xdda47104, 0x78e5e37a, 0x92cb2309, 0x378ab177, 0x437ad51e, 0xe63b4760, 0x0c158713, 0xa954156d, 0xe5f54fc1, 0x40b4ddbf, 0xaa9a1dcc, 0x0fdb8fb2, 0x7b2bebdb, 0xde6a79a5, 0x3444b9d6, 0x91052ba8 }; static const uint32_t table3_[256] = { 0x00000000, 0xdd45aab8, 0xbf672381, 0x62228939, 0x7b2231f3, 0xa6679b4b, 0xc4451272, 0x1900b8ca, 0xf64463e6, 0x2b01c95e, 0x49234067, 0x9466eadf, 0x8d665215, 0x5023f8ad, 0x32017194, 0xef44db2c, 0xe964b13d, 0x34211b85, 0x560392bc, 0x8b463804, 0x924680ce, 0x4f032a76, 0x2d21a34f, 0xf06409f7, 0x1f20d2db, 0xc2657863, 0xa047f15a, 0x7d025be2, 0x6402e328, 0xb9474990, 0xdb65c0a9, 0x06206a11, 0xd725148b, 0x0a60be33, 0x6842370a, 0xb5079db2, 0xac072578, 0x71428fc0, 0x136006f9, 0xce25ac41, 0x2161776d, 0xfc24ddd5, 0x9e0654ec, 0x4343fe54, 
0x5a43469e, 0x8706ec26, 0xe524651f, 0x3861cfa7, 0x3e41a5b6, 0xe3040f0e, 0x81268637, 0x5c632c8f, 0x45639445, 0x98263efd, 0xfa04b7c4, 0x27411d7c, 0xc805c650, 0x15406ce8, 0x7762e5d1, 0xaa274f69, 0xb327f7a3, 0x6e625d1b, 0x0c40d422, 0xd1057e9a, 0xaba65fe7, 0x76e3f55f, 0x14c17c66, 0xc984d6de, 0xd0846e14, 0x0dc1c4ac, 0x6fe34d95, 0xb2a6e72d, 0x5de23c01, 0x80a796b9, 0xe2851f80, 0x3fc0b538, 0x26c00df2, 0xfb85a74a, 0x99a72e73, 0x44e284cb, 0x42c2eeda, 0x9f874462, 0xfda5cd5b, 0x20e067e3, 0x39e0df29, 0xe4a57591, 0x8687fca8, 0x5bc25610, 0xb4868d3c, 0x69c32784, 0x0be1aebd, 0xd6a40405, 0xcfa4bccf, 0x12e11677, 0x70c39f4e, 0xad8635f6, 0x7c834b6c, 0xa1c6e1d4, 0xc3e468ed, 0x1ea1c255, 0x07a17a9f, 0xdae4d027, 0xb8c6591e, 0x6583f3a6, 0x8ac7288a, 0x57828232, 0x35a00b0b, 0xe8e5a1b3, 0xf1e51979, 0x2ca0b3c1, 0x4e823af8, 0x93c79040, 0x95e7fa51, 0x48a250e9, 0x2a80d9d0, 0xf7c57368, 0xeec5cba2, 0x3380611a, 0x51a2e823, 0x8ce7429b, 0x63a399b7, 0xbee6330f, 0xdcc4ba36, 0x0181108e, 0x1881a844, 0xc5c402fc, 0xa7e68bc5, 0x7aa3217d, 0x52a0c93f, 0x8fe56387, 0xedc7eabe, 0x30824006, 0x2982f8cc, 0xf4c75274, 0x96e5db4d, 0x4ba071f5, 0xa4e4aad9, 0x79a10061, 0x1b838958, 0xc6c623e0, 0xdfc69b2a, 0x02833192, 0x60a1b8ab, 0xbde41213, 0xbbc47802, 0x6681d2ba, 0x04a35b83, 0xd9e6f13b, 0xc0e649f1, 0x1da3e349, 0x7f816a70, 0xa2c4c0c8, 0x4d801be4, 0x90c5b15c, 0xf2e73865, 0x2fa292dd, 0x36a22a17, 0xebe780af, 0x89c50996, 0x5480a32e, 0x8585ddb4, 0x58c0770c, 0x3ae2fe35, 0xe7a7548d, 0xfea7ec47, 0x23e246ff, 0x41c0cfc6, 0x9c85657e, 0x73c1be52, 0xae8414ea, 0xcca69dd3, 0x11e3376b, 0x08e38fa1, 0xd5a62519, 0xb784ac20, 0x6ac10698, 0x6ce16c89, 0xb1a4c631, 0xd3864f08, 0x0ec3e5b0, 0x17c35d7a, 0xca86f7c2, 0xa8a47efb, 0x75e1d443, 0x9aa50f6f, 0x47e0a5d7, 0x25c22cee, 0xf8878656, 0xe1873e9c, 0x3cc29424, 0x5ee01d1d, 0x83a5b7a5, 0xf90696d8, 0x24433c60, 0x4661b559, 0x9b241fe1, 0x8224a72b, 0x5f610d93, 0x3d4384aa, 0xe0062e12, 0x0f42f53e, 0xd2075f86, 0xb025d6bf, 0x6d607c07, 0x7460c4cd, 0xa9256e75, 0xcb07e74c, 0x16424df4, 0x106227e5, 0xcd278d5d, 
0xaf050464, 0x7240aedc, 0x6b401616, 0xb605bcae, 0xd4273597, 0x09629f2f, 0xe6264403, 0x3b63eebb, 0x59416782, 0x8404cd3a, 0x9d0475f0, 0x4041df48, 0x22635671, 0xff26fcc9, 0x2e238253, 0xf36628eb, 0x9144a1d2, 0x4c010b6a, 0x5501b3a0, 0x88441918, 0xea669021, 0x37233a99, 0xd867e1b5, 0x05224b0d, 0x6700c234, 0xba45688c, 0xa345d046, 0x7e007afe, 0x1c22f3c7, 0xc167597f, 0xc747336e, 0x1a0299d6, 0x782010ef, 0xa565ba57, 0xbc65029d, 0x6120a825, 0x0302211c, 0xde478ba4, 0x31035088, 0xec46fa30, 0x8e647309, 0x5321d9b1, 0x4a21617b, 0x9764cbc3, 0xf54642fa, 0x2803e842 }; // Used to fetch a naturally-aligned 32-bit word in little endian byte-order static inline uint32_t LE_LOAD32(const uint8_t *p) { return DecodeFixed32(reinterpret_cast(p)); } uint32_t Extend(uint32_t crc, const char* buf, size_t size) { const uint8_t *p = reinterpret_cast(buf); const uint8_t *e = p + size; uint32_t l = crc ^ 0xffffffffu; #define STEP1 do { \ int c = (l & 0xff) ^ *p++; \ l = table0_[c] ^ (l >> 8); \ } while (0) #define STEP4 do { \ uint32_t c = l ^ LE_LOAD32(p); \ p += 4; \ l = table3_[c & 0xff] ^ \ table2_[(c >> 8) & 0xff] ^ \ table1_[(c >> 16) & 0xff] ^ \ table0_[c >> 24]; \ } while (0) // Point x at first 4-byte aligned byte in string. This might be // just past the end of the string. 
const uintptr_t pval = reinterpret_cast(p); const uint8_t* x = reinterpret_cast(((pval + 3) >> 2) << 2); if (x <= e) { // Process bytes until finished or p is 4-byte aligned while (p != x) { STEP1; } } // Process bytes 16 at a time while ((e-p) >= 16) { STEP4; STEP4; STEP4; STEP4; } // Process bytes 4 at a time while ((e-p) >= 4) { STEP4; } // Process the last few bytes while (p != e) { STEP1; } #undef STEP4 #undef STEP1 return l ^ 0xffffffffu; } // For crc32_combine ulong gf2_matrix_times (ulong *mat, ulong vec) { ulong sum = 0; while (vec) { if (vec & 1) sum ^= *mat; vec >>= 1; mat++; } return sum; } void gf2_matrix_square (ulong *square, ulong *mat) { int n; for (n = 0; n < GF2_DIM; n++) square[n] = gf2_matrix_times(mat, mat[n]); } ulong Combine(ulong crc1, ulong crc2, ulong len2) { // WARNING: // This method is *very* slow. It was used to combine the crc32 of the // header and the crc32 of the content of an entry, and after doing some // profiling, it turned out that Combine() was accountable for 40% of all // the CPU usage in KingDB during write-intensive workloads. The data format // was then changed appropriately so that the use of Combine() would no // longer be needed. If Combine() needs to be used again, its usage must // *absolutely* be carefully profiled and monitored, as it can have a // dramatic impact on performance. // // NOTE: The original zlib code had a polynomial 'odd[0]' equal to // 0xedb88320L, which is the value for the CRC32 checksum. // I changed it for 0x82f63b78, which is the reversed polynomial for // the CRC32C (Castagnoli) checksum, which is the one used in the // code that I copied from LevelDB. 
The polynomial was found in: // http://en.wikipedia.org/wiki/Cyclic_redundancy_check int n; ulong row; ulong even[GF2_DIM]; /* even-power-of-two zeros operator */ ulong odd[GF2_DIM]; /* odd-power-of-two zeros operator */ /* degenerate case */ if (len2 == 0) return crc1; /* put operator for one zero bit in odd */ odd[0] = 0x82f63b78; /* CRC-32 polynomial */ row = 1; for (n = 1; n < GF2_DIM; n++) { odd[n] = row; row <<= 1; } /* put operator for two zero bits in even */ gf2_matrix_square(even, odd); /* put operator for four zero bits in odd */ gf2_matrix_square(odd, even); /* apply len2 zeros to crc1 (first square will put the operator for one zero byte, eight zero bits, in even) */ do { /* apply zeros operator for this bit of len2 */ gf2_matrix_square(even, odd); if (len2 & 1) crc1 = gf2_matrix_times(even, crc1); len2 >>= 1; /* if no more bits set, then done */ if (len2 == 0) break; /* another iteration of the loop with odd and even swapped */ gf2_matrix_square(odd, even); if (len2 & 1) crc1 = gf2_matrix_times(odd, crc1); len2 >>= 1; /* if no more bits set, then done */ } while (len2 != 0); /* return combined crc */ crc1 ^= crc2; return crc1; } /* 8-bit CRC with polynomial x^8+x^6+x^3+x^2+1, 0x14D. Chosen based on Koopman, et al. 
/* 8-bit CRC with polynomial x^8+x^6+x^3+x^2+1, 0x14D. Chosen based on
   Koopman, et al. (0xA6 in his notation = 0x14D >> 1):
   http://www.ece.cmu.edu/~koopman/roses/dsn04/koopman04_crc_poly_embedded.pdf */
static unsigned char crc8_table[] = {
  0x00, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba, 0x95, 0xab, 0xe9, 0xd7, 0x6d, 0x53, 0x11, 0x2f,
  0x4f, 0x71, 0x33, 0x0d, 0xb7, 0x89, 0xcb, 0xf5, 0xda, 0xe4, 0xa6, 0x98, 0x22, 0x1c, 0x5e, 0x60,
  0x9e, 0xa0, 0xe2, 0xdc, 0x66, 0x58, 0x1a, 0x24, 0x0b, 0x35, 0x77, 0x49, 0xf3, 0xcd, 0x8f, 0xb1,
  0xd1, 0xef, 0xad, 0x93, 0x29, 0x17, 0x55, 0x6b, 0x44, 0x7a, 0x38, 0x06, 0xbc, 0x82, 0xc0, 0xfe,
  0x59, 0x67, 0x25, 0x1b, 0xa1, 0x9f, 0xdd, 0xe3, 0xcc, 0xf2, 0xb0, 0x8e, 0x34, 0x0a, 0x48, 0x76,
  0x16, 0x28, 0x6a, 0x54, 0xee, 0xd0, 0x92, 0xac, 0x83, 0xbd, 0xff, 0xc1, 0x7b, 0x45, 0x07, 0x39,
  0xc7, 0xf9, 0xbb, 0x85, 0x3f, 0x01, 0x43, 0x7d, 0x52, 0x6c, 0x2e, 0x10, 0xaa, 0x94, 0xd6, 0xe8,
  0x88, 0xb6, 0xf4, 0xca, 0x70, 0x4e, 0x0c, 0x32, 0x1d, 0x23, 0x61, 0x5f, 0xe5, 0xdb, 0x99, 0xa7,
  0xb2, 0x8c, 0xce, 0xf0, 0x4a, 0x74, 0x36, 0x08, 0x27, 0x19, 0x5b, 0x65, 0xdf, 0xe1, 0xa3, 0x9d,
  0xfd, 0xc3, 0x81, 0xbf, 0x05, 0x3b, 0x79, 0x47, 0x68, 0x56, 0x14, 0x2a, 0x90, 0xae, 0xec, 0xd2,
  0x2c, 0x12, 0x50, 0x6e, 0xd4, 0xea, 0xa8, 0x96, 0xb9, 0x87, 0xc5, 0xfb, 0x41, 0x7f, 0x3d, 0x03,
  0x63, 0x5d, 0x1f, 0x21, 0x9b, 0xa5, 0xe7, 0xd9, 0xf6, 0xc8, 0x8a, 0xb4, 0x0e, 0x30, 0x72, 0x4c,
  0xeb, 0xd5, 0x97, 0xa9, 0x13, 0x2d, 0x6f, 0x51, 0x7e, 0x40, 0x02, 0x3c, 0x86, 0xb8, 0xfa, 0xc4,
  0xa4, 0x9a, 0xd8, 0xe6, 0x5c, 0x62, 0x20, 0x1e, 0x31, 0x0f, 0x4d, 0x73, 0xc9, 0xf7, 0xb5, 0x8b,
  0x75, 0x4b, 0x09, 0x37, 0x8d, 0xb3, 0xf1, 0xcf, 0xe0, 0xde, 0x9c, 0xa2, 0x18, 0x26, 0x64, 0x5a,
  0x3a, 0x04, 0x46, 0x78, 0xc2, 0xfc, 0xbe, 0x80, 0xaf, 0x91, 0xd3, 0xed, 0x57, 0x69, 0x2b, 0x15};

// Table-driven 8-bit CRC of 'data[0,len-1]', seeded with 'crc' so it can be
// chained across calls. A zero-length buffer returns the seed unchanged.
uint8_t crc8(unsigned crc, unsigned char *data, size_t len) {
  unsigned char *end;
  if (len == 0) return crc;
  crc ^= 0xff;  // pre-conditioning, undone on return
  end = data + len;
  do {
    crc = crc8_table[crc ^ *data++];
  } while (data < end);
  return crc ^ 0xff;
}

// Convenience overload for plain char buffers.
// (The cast's type argument was garbled in extraction; restored here.)
uint8_t crc8(unsigned crc, char *data, size_t len) {
  return crc8(crc, reinterpret_cast<unsigned char*>(data), len);
}
inline uint32_t Unmask(uint32_t masked_crc) { uint32_t rot = masked_crc - kMaskDelta; return ((rot >> 17) | (rot << 15)); } // For crc32_combine typedef uint32_t ulong; typedef int64_t I64; ulong Combine(ulong crc1, ulong crc2, ulong len2); #define GF2_DIM 32 // 8-bit CRC uint8_t crc8(unsigned crc, unsigned char *data, size_t len); uint8_t crc8(unsigned crc, char *data, size_t len); } // namespace crc32c class CRC32 { public: CRC32() {} ~CRC32() { } // Added an empty copy assignment operator to avoid error messages of the type: // "object of type '...' cannot be assigned because its copy assignment // operator is implicitly deleted" CRC32& operator=(const CRC32& r) { if(&r == this) return *this; return *this; } void stream(const char* data, size_t n) { //log::trace("CRC32", "size: %zu", n); uint64_t c = ts_.get(); uint32_t c32 = c; uint32_t c_new = crc32c::Extend(c32, data, n); ts_.put(c_new); } uint32_t get() { return ts_.get(); } void put(uint32_t c32) { ts_.put(c32); } void ResetThreadLocalStorage() { ts_.reset(); } virtual uint64_t MaxInputSize() { return std::numeric_limits::max(); } private: kdb::ThreadStorage ts_; }; } // namespace kdb #endif // KINGDB_CRC32_H_ ================================================ FILE: algorithm/endian.cc ================================================ #include "algorithm/endian.h" namespace kdb { endian_t getEndianness() { if ((0xffffffff & 1) == kBytesLittleEndian) { return kBytesLittleEndian; } else if ((0xffffffff & 1) == kBytesBigEndian) { return kBytesBigEndian; } else if ((0xffffffff & 1) == kBytesLittleEndianWord) { return kBytesLittleEndianWord; } else if ((0xffffffff & 1) == kBytesBigEndianWord) { return kBytesBigEndianWord; } return kBytesUnknownEndian; } const bool kLittleEndian = (getEndianness() == kBytesLittleEndian); const bool kBigEndian = (getEndianness() == kBytesBigEndian); } // namespace kdb ================================================ FILE: algorithm/endian.h 
// Sentinel values for byte-order detection: each constant is the 32-bit
// integer obtained by reading the byte sequence {1, 0, 0, 0} on a machine
// with the corresponding byte order (see getEndianness()).
enum endian_t : uint32_t {
  kBytesLittleEndian = 0x00000001,      // byte-swapped little-endian
  kBytesBigEndian = 0x01000000,         // byte-swapped big-endian
  kBytesLittleEndianWord = 0x00010000,  // word-swapped little-endian
  kBytesBigEndianWord = 0x00000100,     // word-swapped big-endian
  kBytesUnknownEndian = 0xffffffff
};

//TODO: linux compatibility
/*
constexpr endian_t getEndianness() {
  if ((0xffffffff & 1) == kBytesLittleEndian) {
    return kBytesLittleEndian;
  } else if ((0xffffffff & 1) == kBytesBigEndian) {
    return kBytesBigEndian;
  } else if ((0xffffffff & 1) == kBytesLittleEndianWord) {
    return kBytesLittleEndianWord;
  } else if ((0xffffffff & 1) == kBytesBigEndianWord) {
    return kBytesBigEndianWord;
  }
  return kBytesUnknownEndian;
}
*/

// Runtime byte-order probe, defined in endian.cc.
endian_t getEndianness();

// Computed once at static-initialization time from getEndianness().
extern const bool kLittleEndian;
extern const bool kBigEndian;
#include "algorithm/hash.h" namespace kdb { uint64_t MurmurHash3::HashFunction(const char *data, uint32_t len) { // NOTE: You may need to change the seed, which by default is 0 static char hash[16]; static uint64_t output; // NOTE: Beware, the len in MurmurHash3_x64_128 is an 'int', not a 'uint32_t' MurmurHash3_x64_128(data, len, 0, hash); memcpy(&output, hash, 8); return output; } uint64_t xxHash::HashFunction(const char *data, uint32_t len) { // NOTE: You may need to change the seed, which by default is 0 return XXH64(data, len, 0); } Hash* MakeHash(HashType ht) { if (ht == kMurmurHash3_64) { return new MurmurHash3(); } else if (ht == kxxHash_64) { return new xxHash(); } else { log::emerg("Hash", "Unknown hashing function: [%d]", ht); exit(-1); } } } // namespace kdb ================================================ FILE: algorithm/hash.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. 
#ifndef KINGDB_HASH_H_ #define KINGDB_HASH_H_ #include "util/debug.h" #include #include #include #include #include "util/logger.h" #include "util/options.h" #include "algorithm/murmurhash3.h" #include "algorithm/xxhash.h" namespace kdb { class Hash { public: Hash() {} virtual ~Hash() {} virtual uint64_t HashFunction(const char *data, uint32_t len) = 0; virtual uint64_t MaxInputSize() = 0; }; class MurmurHash3: public Hash { public: MurmurHash3() {} virtual ~MurmurHash3() {} virtual uint64_t HashFunction(const char *data, uint32_t len); virtual uint64_t MaxInputSize() { return std::numeric_limits::max(); } }; class xxHash: public Hash { public: xxHash() {} virtual ~xxHash() {} virtual uint64_t HashFunction(const char *data, uint32_t len); virtual uint64_t MaxInputSize() { return std::numeric_limits::max(); } }; Hash* MakeHash(HashType ht); } // namespace kdb #endif // KINGDB_HASH_H_ ================================================ FILE: algorithm/lz4.cc ================================================ /* LZ4 - Fast LZ compression algorithm Copyright (C) 2011-2014, Yann Collet. BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact the author at : - LZ4 source repository : http://code.google.com/p/lz4/ - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c */ /************************************** Tuning parameters **************************************/ /* * HEAPMODE : * Select how default compression functions will allocate memory for their hash table, * in memory stack (0:default, fastest), or in memory heap (1:requires memory allocation (malloc)). */ #define HEAPMODE 0 /************************************** CPU Feature Detection **************************************/ /* 32 or 64 bits ? */ #if (defined(__x86_64__) || defined(_M_X64) || defined(_WIN64) \ || defined(__powerpc64__) || defined(__powerpc64le__) \ || defined(__ppc64__) || defined(__ppc64le__) \ || defined(__PPC64__) || defined(__PPC64LE__) \ || defined(__ia64) || defined(__itanium__) || defined(_M_IA64) ) /* Detects 64 bits mode */ # define LZ4_ARCH64 1 #else # define LZ4_ARCH64 0 #endif #define LZ4_32BITS (sizeof(void*)==4) #define LZ4_64BITS (sizeof(void*)==8) /* * Little Endian or Big Endian ? 
* Overwrite the #define below if you know your architecture endianess */ #include /* Apparently required to detect endianess */ #if defined (__GLIBC__) # include # if (__BYTE_ORDER == __BIG_ENDIAN) # define LZ4_BIG_ENDIAN 1 # endif #elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN)) # define LZ4_BIG_ENDIAN 1 #elif defined(__sparc) || defined(__sparc__) \ || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) \ || defined(__hpux) || defined(__hppa) \ || defined(_MIPSEB) || defined(__s390__) # define LZ4_BIG_ENDIAN 1 #else /* Little Endian assumed. PDP Endian and other very rare endian format are unsupported. */ #endif /* * Unaligned memory access is automatically enabled for "common" CPU, such as x86. * For others CPU, such as ARM, the compiler may be more cautious, inserting unnecessary extra code to ensure aligned access property * If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance */ #if defined(__ARM_FEATURE_UNALIGNED) # define LZ4_FORCE_UNALIGNED_ACCESS 1 #endif /* Define this parameter if your target system or compiler does not support hardware bit count */ #if defined(_MSC_VER) && defined(_WIN32_WCE) /* Visual Studio for Windows CE does not support Hardware bit count */ # define LZ4_FORCE_SW_BITCOUNT #endif /* * BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE : * This option may provide a small boost to performance for some big endian cpu, although probably modest. * You may set this option to 1 if data will remain within closed environment. 
* This option is useless on Little_Endian CPU (such as x86) */ /* #define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 */ /************************************** Compiler Options **************************************/ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ /* "restrict" is a known keyword */ #else # define restrict /* Disable restrict */ #endif #ifdef _MSC_VER /* Visual Studio */ # define FORCE_INLINE static __forceinline # include /* For Visual 2005 */ # if LZ4_ARCH64 /* 64-bits */ # pragma intrinsic(_BitScanForward64) /* For Visual 2005 */ # pragma intrinsic(_BitScanReverse64) /* For Visual 2005 */ # else /* 32-bits */ # pragma intrinsic(_BitScanForward) /* For Visual 2005 */ # pragma intrinsic(_BitScanReverse) /* For Visual 2005 */ # endif # pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ #else # ifdef __GNUC__ # define FORCE_INLINE static inline __attribute__((always_inline)) # else # define FORCE_INLINE static inline # endif #endif #ifdef _MSC_VER /* Visual Studio */ # define lz4_bswap16(x) _byteswap_ushort(x) #else # define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | (((x) & 0xffu) << 8))) #endif #define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) #if (GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__) # define expect(expr,value) (__builtin_expect ((expr),(value)) ) #else # define expect(expr,value) (expr) #endif #define likely(expr) expect((expr) != 0, 1) #define unlikely(expr) expect((expr) != 0, 0) /************************************** Memory routines **************************************/ #include /* malloc, calloc, free */ #define ALLOCATOR(n,s) calloc(n,s) #define FREEMEM free #include /* memset, memcpy */ #define MEM_INIT memset /************************************** Includes **************************************/ #include "algorithm/lz4.h" /************************************** Basic Types **************************************/ #if defined 
(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */ # include typedef uint8_t BYTE; typedef uint16_t U16; typedef uint32_t U32; typedef int32_t S32; typedef uint64_t U64; #else typedef unsigned char BYTE; typedef unsigned short U16; typedef unsigned int U32; typedef signed int S32; typedef unsigned long long U64; #endif #if defined(__GNUC__) && !defined(LZ4_FORCE_UNALIGNED_ACCESS) # define _PACKED __attribute__ ((packed)) #else # define _PACKED #endif #if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__) # if defined(__IBMC__) || defined(__SUNPRO_C) || defined(__SUNPRO_CC) # pragma pack(1) # else # pragma pack(push, 1) # endif #endif typedef struct { U16 v; } _PACKED U16_S; typedef struct { U32 v; } _PACKED U32_S; typedef struct { U64 v; } _PACKED U64_S; typedef struct {size_t v;} _PACKED size_t_S; #if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__) # if defined(__SUNPRO_C) || defined(__SUNPRO_CC) # pragma pack(0) # else # pragma pack(pop) # endif #endif #define A16(x) (((U16_S *)(x))->v) #define A32(x) (((U32_S *)(x))->v) #define A64(x) (((U64_S *)(x))->v) #define AARCH(x) (((size_t_S *)(x))->v) /************************************** Constants **************************************/ #define LZ4_HASHLOG (LZ4_MEMORY_USAGE-2) #define HASHTABLESIZE (1 << LZ4_MEMORY_USAGE) #define HASH_SIZE_U32 (1 << LZ4_HASHLOG) #define MINMATCH 4 #define COPYLENGTH 8 #define LASTLITERALS 5 #define MFLIMIT (COPYLENGTH+MINMATCH) static const int LZ4_minLength = (MFLIMIT+1); #define KB *(1U<<10) #define MB *(1U<<20) #define GB *(1U<<30) #define LZ4_64KLIMIT ((64 KB) + (MFLIMIT-1)) #define SKIPSTRENGTH 6 /* Increasing this value will make the compression run slower on incompressible data */ #define MAXD_LOG 16 #define MAX_DISTANCE ((1 << MAXD_LOG) - 1) #define ML_BITS 4 #define ML_MASK ((1U<=e; */ #else # define LZ4_WILDCOPY(d,s,e) { if (likely(e-d <= 8)) LZ4_COPY8(d,s) else do { LZ4_COPY8(d,s) } while (d>3); # elif defined(__GNUC__) && (GCC_VERSION >= 304) 
&& !defined(LZ4_FORCE_SW_BITCOUNT) return (__builtin_clzll(val) >> 3); # else int r; if (!(val>>32)) { r=4; } else { r=0; val>>=32; } if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } r += (!val); return r; # endif # else # if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) unsigned long r = 0; _BitScanForward64( &r, val ); return (int)(r>>3); # elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) return (__builtin_ctzll(val) >> 3); # else static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 }; return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; # endif # endif } #else static int LZ4_NbCommonBytes (register U32 val) { # if defined(LZ4_BIG_ENDIAN) # if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) unsigned long r = 0; _BitScanReverse( &r, val ); return (int)(r>>3); # elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) return (__builtin_clz(val) >> 3); # else int r; if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } r += (!val); return r; # endif # else # if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT) unsigned long r; _BitScanForward( &r, val ); return (int)(r>>3); # elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT) return (__builtin_ctz(val) >> 3); # else static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 }; return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; # endif # endif } #endif /******************************** Compression functions ********************************/ int LZ4_versionNumber (void) { return LZ4_VERSION_NUMBER; } int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); } static int LZ4_hashSequence(U32 
sequence, tableType_t tableType) { if (tableType == byU16) return (((sequence) * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1))); else return (((sequence) * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG)); } static int LZ4_hashPosition(const BYTE* p, tableType_t tableType) { return LZ4_hashSequence(A32(p), tableType); } static void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) { switch (tableType) { case byPtr: { const BYTE** hashTable = (const BYTE**) tableBase; hashTable[h] = p; break; } case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); break; } case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); break; } } } static void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) { U32 h = LZ4_hashPosition(p, tableType); LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase); } static const BYTE* LZ4_getPositionOnHash(U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase) { if (tableType == byPtr) { const BYTE** hashTable = (const BYTE**) tableBase; return hashTable[h]; } if (tableType == byU32) { U32* hashTable = (U32*) tableBase; return hashTable[h] + srcBase; } { U16* hashTable = (U16*) tableBase; return hashTable[h] + srcBase; } /* default, to ensure a return */ } static const BYTE* LZ4_getPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase) { U32 h = LZ4_hashPosition(p, tableType); return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase); } static unsigned LZ4_count(const BYTE* pIn, const BYTE* pRef, const BYTE* pInLimit) { const BYTE* const pStart = pIn; while (likely(pIndictSize; const BYTE* const dictionary = dictPtr->dictionary; const BYTE* const dictEnd = dictionary + dictPtr->dictSize; const size_t dictDelta = dictEnd - (const BYTE*)source; const BYTE* anchor = (const BYTE*) source; const BYTE* const iend = ip + inputSize; const BYTE* const mflimit = 
iend - MFLIMIT; const BYTE* const matchlimit = iend - LASTLITERALS; BYTE* op = (BYTE*) dest; BYTE* const olimit = op + maxOutputSize; const int skipStrength = SKIPSTRENGTH; U32 forwardH; size_t refDelta=0; /* Init conditions */ if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0; /* Unsupported input size, too large (or negative) */ switch(dict) { case noDict: default: base = (const BYTE*)source; lowLimit = (const BYTE*)source; break; case withPrefix64k: base = (const BYTE*)source - dictPtr->currentOffset; lowLimit = (const BYTE*)source - dictPtr->dictSize; break; case usingExtDict: base = (const BYTE*)source - dictPtr->currentOffset; lowLimit = (const BYTE*)source; break; } if ((tableType == byU16) && (inputSize>=(int)LZ4_64KLIMIT)) return 0; /* Size too large (not within 64K limit) */ if (inputSize> skipStrength; if (unlikely(forwardIp > mflimit)) goto _last_literals; ref = LZ4_getPositionOnHash(h, ctx, tableType, base); if (dict==usingExtDict) { if (ref<(const BYTE*)source) { refDelta = dictDelta; lowLimit = dictionary; } else { refDelta = 0; lowLimit = (const BYTE*)source; } } forwardH = LZ4_hashPosition(forwardIp, tableType); LZ4_putPositionOnHash(ip, h, ctx, tableType, base); } while ( ((dictIssue==dictSmall) ? (ref < lowRefLimit) : 0) || ((tableType==byU16) ? 
0 : (ref + MAX_DISTANCE < ip)) || (A32(ref+refDelta) != A32(ip)) ); } /* Catch up */ while ((ip>anchor) && (ref+refDelta > lowLimit) && (unlikely(ip[-1]==ref[refDelta-1]))) { ip--; ref--; } { /* Encode Literal length */ unsigned litLength = (unsigned)(ip - anchor); token = op++; if ((outputLimited) && (unlikely(op + litLength + (2 + 1 + LASTLITERALS) + (litLength/255) > olimit))) return 0; /* Check output limit */ if (litLength>=RUN_MASK) { int len = (int)litLength-RUN_MASK; *token=(RUN_MASK<= 255 ; len-=255) *op++ = 255; *op++ = (BYTE)len; } else *token = (BYTE)(litLength< matchlimit) limit = matchlimit; matchLength = LZ4_count(ip+MINMATCH, ref+MINMATCH, limit); ip += MINMATCH + matchLength; if (ip==limit) { unsigned more = LZ4_count(ip, (const BYTE*)source, matchlimit); matchLength += more; ip += more; } } else { matchLength = LZ4_count(ip+MINMATCH, ref+MINMATCH, matchlimit); ip += MINMATCH + matchLength; } if (matchLength>=ML_MASK) { if ((outputLimited) && (unlikely(op + (1 + LASTLITERALS) + (matchLength>>8) > olimit))) return 0; /* Check output limit */ *token += ML_MASK; matchLength -= ML_MASK; for (; matchLength >= 510 ; matchLength-=510) { *op++ = 255; *op++ = 255; } if (matchLength >= 255) { matchLength-=255; *op++ = 255; } *op++ = (BYTE)matchLength; } else *token += (BYTE)(matchLength); } anchor = ip; /* Test end of chunk */ if (ip > mflimit) break; /* Fill table */ LZ4_putPosition(ip-2, ctx, tableType, base); /* Test next position */ ref = LZ4_getPosition(ip, ctx, tableType, base); if (dict==usingExtDict) { if (ref<(const BYTE*)source) { refDelta = dictDelta; lowLimit = dictionary; } else { refDelta = 0; lowLimit = (const BYTE*)source; } } LZ4_putPosition(ip, ctx, tableType, base); if ( ((dictIssue==dictSmall) ? 
(ref>=lowRefLimit) : 1) && (ref+MAX_DISTANCE>=ip) && (A32(ref+refDelta)==A32(ip)) ) { token=op++; *token=0; goto _next_match; } /* Prepare next loop */ forwardH = LZ4_hashPosition(++ip, tableType); } _last_literals: /* Encode Last Literals */ { int lastRun = (int)(iend - anchor); if ((outputLimited) && (((char*)op - dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize)) return 0; /* Check output limit */ if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK<= 255 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; } else *op++ = (BYTE)(lastRun<= sizeof(LZ4_stream_t_internal)); /* A compilation error here means LZ4_STREAMSIZE is not large enough */ if (dict->initCheck) LZ4_resetStream(LZ4_dict); /* Uninitialized structure detected */ if (dictSize < MINMATCH) { dict->dictionary = NULL; dict->dictSize = 0; return 1; } if (p <= dictEnd - 64 KB) p = dictEnd - 64 KB; base = p - dict->currentOffset; dict->dictionary = p; dict->dictSize = (U32)(dictEnd - p); dict->currentOffset += dict->dictSize; while (p <= dictEnd-MINMATCH) { LZ4_putPosition(p, dict, byU32, base); p+=3; } return 1; } static void LZ4_renormDictT(LZ4_stream_t_internal* LZ4_dict, const BYTE* src) { if ((LZ4_dict->currentOffset > 0x80000000) || ((size_t)LZ4_dict->currentOffset > (size_t)src)) /* address space overflow */ { /* rescale hash table */ U32 delta = LZ4_dict->currentOffset - 64 KB; const BYTE* dictEnd = LZ4_dict->dictionary + LZ4_dict->dictSize; int i; for (i=0; ihashTable[i] < delta) LZ4_dict->hashTable[i]=0; else LZ4_dict->hashTable[i] -= delta; } LZ4_dict->currentOffset = 64 KB; if (LZ4_dict->dictSize > 64 KB) LZ4_dict->dictSize = 64 KB; LZ4_dict->dictionary = dictEnd - LZ4_dict->dictSize; } } FORCE_INLINE int LZ4_compress_continue_generic (void* LZ4_stream, const char* source, char* dest, int inputSize, int maxOutputSize, limitedOutput_directive limit) { LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_stream; const BYTE* const dictEnd = streamPtr->dictionary + 
streamPtr->dictSize; const BYTE* smallest = (const BYTE*) source; if (streamPtr->initCheck) return 0; /* Uninitialized structure detected */ if ((streamPtr->dictSize>0) && (smallest>dictEnd)) smallest = dictEnd; LZ4_renormDictT(streamPtr, smallest); /* Check overlapping input/dictionary space */ { const BYTE* sourceEnd = (const BYTE*) source + inputSize; if ((sourceEnd > streamPtr->dictionary) && (sourceEnd < dictEnd)) { streamPtr->dictSize = (U32)(dictEnd - sourceEnd); if (streamPtr->dictSize > 64 KB) streamPtr->dictSize = 64 KB; if (streamPtr->dictSize < 4) streamPtr->dictSize = 0; streamPtr->dictionary = dictEnd - streamPtr->dictSize; } } /* prefix mode : source data follows dictionary */ if (dictEnd == (const BYTE*)source) { int result; if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limit, byU32, withPrefix64k, dictSmall); else result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limit, byU32, withPrefix64k, noDictIssue); streamPtr->dictSize += (U32)inputSize; streamPtr->currentOffset += (U32)inputSize; return result; } /* external dictionary mode */ { int result; if ((streamPtr->dictSize < 64 KB) && (streamPtr->dictSize < streamPtr->currentOffset)) result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limit, byU32, usingExtDict, dictSmall); else result = LZ4_compress_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limit, byU32, usingExtDict, noDictIssue); streamPtr->dictionary = (const BYTE*)source; streamPtr->dictSize = (U32)inputSize; streamPtr->currentOffset += (U32)inputSize; return result; } } int LZ4_compress_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize) { return LZ4_compress_continue_generic(LZ4_stream, source, dest, inputSize, 0, notLimited); } int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_stream, const char* 
source, char* dest, int inputSize, int maxOutputSize) { return LZ4_compress_continue_generic(LZ4_stream, source, dest, inputSize, maxOutputSize, limitedOutput); } /* Hidden debug function, to force separate dictionary mode */ int LZ4_compress_forceExtDict (LZ4_stream_t* LZ4_dict, const char* source, char* dest, int inputSize) { LZ4_stream_t_internal* streamPtr = (LZ4_stream_t_internal*)LZ4_dict; int result; const BYTE* const dictEnd = streamPtr->dictionary + streamPtr->dictSize; const BYTE* smallest = dictEnd; if (smallest > (const BYTE*) source) smallest = (const BYTE*) source; LZ4_renormDictT((LZ4_stream_t_internal*)LZ4_dict, smallest); result = LZ4_compress_generic(LZ4_dict, source, dest, inputSize, 0, notLimited, byU32, usingExtDict, noDictIssue); streamPtr->dictionary = (const BYTE*)source; streamPtr->dictSize = (U32)inputSize; streamPtr->currentOffset += (U32)inputSize; return result; } int LZ4_saveDict (LZ4_stream_t* LZ4_dict, char* safeBuffer, int dictSize) { LZ4_stream_t_internal* dict = (LZ4_stream_t_internal*) LZ4_dict; const BYTE* previousDictEnd = dict->dictionary + dict->dictSize; if ((U32)dictSize > 64 KB) dictSize = 64 KB; /* useless to define a dictionary > 64 KB */ if ((U32)dictSize > dict->dictSize) dictSize = dict->dictSize; memcpy(safeBuffer, previousDictEnd - dictSize, dictSize); dict->dictionary = (const BYTE*)safeBuffer; dict->dictSize = (U32)dictSize; return 1; } /**************************** Decompression functions ****************************/ /* * This generic decompression function cover all use cases. * It shall be instanciated several times, using different sets of directives * Note that it is essential this generic function is really inlined, * in order to remove useless branches during compilation optimisation. */ FORCE_INLINE int LZ4_decompress_generic( const char* source, char* dest, int inputSize, int outputSize, /* If endOnInput==endOnInputSize, this value is the max size of Output Buffer. 
*/ int endOnInput, /* endOnOutputSize, endOnInputSize */ int partialDecoding, /* full, partial */ int targetOutputSize, /* only used if partialDecoding==partial */ int dict, /* noDict, withPrefix64k, usingExtDict */ const char* dictStart, /* only if dict==usingExtDict */ int dictSize /* note : = 0 if noDict */ ) { /* Local Variables */ const BYTE* restrict ip = (const BYTE*) source; const BYTE* ref; const BYTE* const iend = ip + inputSize; BYTE* op = (BYTE*) dest; BYTE* const oend = op + outputSize; BYTE* cpy; BYTE* oexit = op + targetOutputSize; const BYTE* const lowLimit = (const BYTE*)dest - dictSize; const BYTE* const dictEnd = (const BYTE*)dictStart + dictSize; const size_t dec32table[] = {4-0, 4-3, 4-2, 4-3, 4-0, 4-0, 4-0, 4-0}; /* note : static reduces speed for LZ4_decompress_safe() on GCC64 */ static const size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3}; const int safeDecode = (endOnInput==endOnInputSize); const int checkOffset = ((safeDecode) && (dictSize < (int)(64 KB))); /* Special cases */ if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT; /* targetOutputSize too high => decode everything */ if ((endOnInput) && (unlikely(outputSize==0))) return ((inputSize==1) && (*ip==0)) ? 
0 : -1; /* Empty output buffer */ if ((!endOnInput) && (unlikely(outputSize==0))) return (*ip==0?1:-1); /* Main Loop */ while (1) { unsigned token; size_t length; /* get runlength */ token = *ip++; if ((length=(token>>ML_BITS)) == RUN_MASK) { unsigned s; do { s = *ip++; length += s; } while (likely((endOnInput)?ip(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) ) || ((!endOnInput) && (cpy>oend-COPYLENGTH))) { if (partialDecoding) { if (cpy > oend) goto _output_error; /* Error : write attempt beyond end of output buffer */ if ((endOnInput) && (ip+length > iend)) goto _output_error; /* Error : read attempt beyond end of input buffer */ } else { if ((!endOnInput) && (cpy != oend)) goto _output_error; /* Error : block decoding must stop exactly there */ if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error; /* Error : input must be consumed */ } memcpy(op, ip, length); ip += length; op += length; break; /* Necessarily EOF, due to parsing restrictions */ } LZ4_WILDCOPY(op, ip, cpy); ip -= (op-cpy); op = cpy; /* get offset */ LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2; if ((checkOffset) && (unlikely(ref < lowLimit))) goto _output_error; /* Error : offset outside destination buffer */ /* get matchlength */ if ((length=(token&ML_MASK)) == ML_MASK) { unsigned s; do { if ((endOnInput) && (ip > iend-LASTLITERALS)) goto _output_error; s = *ip++; length += s; } while (s==255); if ((safeDecode) && LZ4_32BITS && unlikely((size_t)(op+length)<(size_t)op)) goto _output_error; /* overflow detection */ } /* check external dictionary */ if ((dict==usingExtDict) && (ref < (BYTE* const)dest)) { if (unlikely(op+length+MINMATCH > oend-LASTLITERALS)) goto _output_error; if (length+MINMATCH <= (size_t)(dest-(char*)ref)) { ref = dictEnd - (dest-(char*)ref); memcpy(op, ref, length+MINMATCH); op += length+MINMATCH; } else { size_t copySize = (size_t)(dest-(char*)ref); memcpy(op, dictEnd - copySize, copySize); op += copySize; copySize = 
length+MINMATCH - copySize; if (copySize > (size_t)((char*)op-dest)) /* overlap */ { BYTE* const endOfMatch = op + copySize; const BYTE* copyFrom = (BYTE*)dest; while (op < endOfMatch) *op++ = *copyFrom++; } else { memcpy(op, dest, copySize); op += copySize; } } continue; } /* copy repeated sequence */ if (unlikely((op-ref)<(int)STEPSIZE)) { const size_t dec64 = dec64table[LZ4_32BITS ? 0 : op-ref]; op[0] = ref[0]; op[1] = ref[1]; op[2] = ref[2]; op[3] = ref[3]; ref += dec32table[op-ref]; A32(op+4) = A32(ref); op += STEPSIZE; ref -= dec64; } else { LZ4_COPYSTEP(op,ref); } cpy = op + length - (STEPSIZE-4); if (unlikely(cpy>oend-COPYLENGTH-(STEPSIZE-4))) { if (cpy > oend-LASTLITERALS) goto _output_error; /* Error : last 5 bytes must be literals */ if (opdictionary = dictionary; lz4sd->dictSize = dictSize; return 1; } /* *_continue() : These decoding functions allow decompression of multiple blocks in "streaming" mode. Previously decoded blocks must still be available at the memory position where they were decoded. 
If it's not possible, save the relevant part of decoded data into a safe buffer, and indicate where it stands using LZ4_setDictDecode() */ int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize) { LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode; int result; result = LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, usingExtDict, lz4sd->dictionary, lz4sd->dictSize); if (result <= 0) return result; if (lz4sd->dictionary + lz4sd->dictSize == dest) { lz4sd->dictSize += result; } else { lz4sd->dictionary = dest; lz4sd->dictSize = result; } return result; } int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize) { LZ4_streamDecode_t_internal* lz4sd = (LZ4_streamDecode_t_internal*) LZ4_streamDecode; int result; result = LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, usingExtDict, lz4sd->dictionary, lz4sd->dictSize); if (result <= 0) return result; if (lz4sd->dictionary + lz4sd->dictSize == dest) { lz4sd->dictSize += result; } else { lz4sd->dictionary = dest; lz4sd->dictSize = result; } return result; } /* Advanced decoding functions : *_usingDict() : These decoding functions work the same as "_continue" ones, the dictionary must be explicitly provided within parameters */ int LZ4_decompress_safe_usingDict(const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize) { return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, usingExtDict, dictStart, dictSize); } int LZ4_decompress_fast_usingDict(const char* source, char* dest, int originalSize, const char* dictStart, int dictSize) { return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, usingExtDict, dictStart, dictSize); } 
/*************************************************** Obsolete Functions ***************************************************/ /* These function names are deprecated and should no longer be used. They are only provided here for compatibility with older user programs. - LZ4_uncompress is totally equivalent to LZ4_decompress_fast - LZ4_uncompress_unknownOutputSize is totally equivalent to LZ4_decompress_safe */ int LZ4_uncompress (const char* source, char* dest, int outputSize) { return LZ4_decompress_fast(source, dest, outputSize); } int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); } /* Obsolete Streaming functions */ int LZ4_sizeofStreamState() { return LZ4_STREAMSIZE; } static void LZ4_init(LZ4_stream_t_internal* lz4ds, const BYTE* base) { MEM_INIT(lz4ds, 0, LZ4_STREAMSIZE); lz4ds->bufferStart = base; } int LZ4_resetStreamState(void* state, const char* inputBuffer) { if ((((size_t)state) & 3) != 0) return 1; /* Error : pointer is not aligned on 4-bytes boundary */ LZ4_init((LZ4_stream_t_internal*)state, (const BYTE*)inputBuffer); return 0; } void* LZ4_create (const char* inputBuffer) { void* lz4ds = ALLOCATOR(4, LZ4_STREAMSIZE_U32); LZ4_init ((LZ4_stream_t_internal*)lz4ds, (const BYTE*)inputBuffer); return lz4ds; } char* LZ4_slideInputBuffer (void* LZ4_Data) { LZ4_stream_t_internal* lz4ds = (LZ4_stream_t_internal*)LZ4_Data; LZ4_saveDict((LZ4_stream_t*)LZ4_Data, (char*)lz4ds->bufferStart, 64 KB); return (char*)(lz4ds->bufferStart + 64 KB); } /* Obsolete compresson functions using User-allocated state */ int LZ4_sizeofState() { return LZ4_STREAMSIZE; } int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize) { if (((size_t)(state)&3) != 0) return 0; /* Error : state is not aligned on 4-bytes boundary */ MEM_INIT(state, 0, LZ4_STREAMSIZE); if (inputSize < (int)LZ4_64KLIMIT) return LZ4_compress_generic(state, source, dest, 
inputSize, 0, notLimited, byU16, noDict, noDictIssue); else return LZ4_compress_generic(state, source, dest, inputSize, 0, notLimited, LZ4_64BITS ? byU32 : byPtr, noDict, noDictIssue); } int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize) { if (((size_t)(state)&3) != 0) return 0; /* Error : state is not aligned on 4-bytes boundary */ MEM_INIT(state, 0, LZ4_STREAMSIZE); if (inputSize < (int)LZ4_64KLIMIT) return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, byU16, noDict, noDictIssue); else return LZ4_compress_generic(state, source, dest, inputSize, maxOutputSize, limitedOutput, LZ4_64BITS ? byU32 : byPtr, noDict, noDictIssue); } /* Obsolete streaming decompression functions */ int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int compressedSize, int maxOutputSize) { return LZ4_decompress_generic(source, dest, compressedSize, maxOutputSize, endOnInputSize, full, 0, withPrefix64k, NULL, 64 KB); } int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int originalSize) { return LZ4_decompress_generic(source, dest, 0, originalSize, endOnOutputSize, full, 0, withPrefix64k, NULL, 64 KB); } ================================================ FILE: algorithm/lz4.h ================================================ /* LZ4 - Fast LZ compression algorithm Header File Copyright (C) 2011-2014, Yann Collet. BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact the author at : - LZ4 source repository : http://code.google.com/p/lz4/ - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c */ #pragma once #if defined (__cplusplus) extern "C" { #endif /************************************** Version **************************************/ #define LZ4_VERSION_MAJOR 1 /* for major interface/format changes */ #define LZ4_VERSION_MINOR 3 /* for minor interface/format changes */ #define LZ4_VERSION_RELEASE 0 /* for tweaks, bug-fixes, or development */ #define LZ4_VERSION_NUMBER (LZ4_VERSION_MAJOR *100*100 + LZ4_VERSION_MINOR *100 + LZ4_VERSION_RELEASE) int LZ4_versionNumber (void); /************************************** Tuning parameter **************************************/ /* * LZ4_MEMORY_USAGE : * Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.) 
* Increasing memory usage improves compression ratio * Reduced memory usage can improve speed, due to cache effect * Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */ #define LZ4_MEMORY_USAGE 14 /************************************** Simple Functions **************************************/ int LZ4_compress (const char* source, char* dest, int inputSize); int LZ4_decompress_safe (const char* source, char* dest, int compressedSize, int maxDecompressedSize); /* LZ4_compress() : Compresses 'inputSize' bytes from 'source' into 'dest'. Destination buffer must be already allocated, and must be sized to handle worst cases situations (input data not compressible) Worst case size evaluation is provided by function LZ4_compressBound() inputSize : Max supported value is LZ4_MAX_INPUT_SIZE return : the number of bytes written in buffer dest or 0 if the compression fails LZ4_decompress_safe() : compressedSize : is obviously the source size maxDecompressedSize : is the size of the destination buffer, which must be already allocated. return : the number of bytes decompressed into the destination buffer (necessarily <= maxDecompressedSize) If the destination buffer is not large enough, decoding will stop and output an error code (<0). If the source stream is detected malformed, the function will stop decoding and return a negative result. This function is protected against buffer overflow exploits : it never writes outside of output buffer, and never reads outside of input buffer. Therefore, it is protected against malicious data packets. 
*/ /* Note : Should you prefer to explicitly allocate compression-table memory using your own allocation method, use the streaming functions provided below, simply reset the memory area between each call to LZ4_compress_continue() */ /************************************** Advanced Functions **************************************/ #define LZ4_MAX_INPUT_SIZE 0x7E000000 /* 2 113 929 216 bytes */ #define LZ4_COMPRESSBOUND(isize) ((unsigned int)(isize) > (unsigned int)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16) /* LZ4_compressBound() : Provides the maximum size that LZ4 may output in a "worst case" scenario (input data not compressible) primarily useful for memory allocation of output buffer. macro is also provided when result needs to be evaluated at compilation (such as stack memory allocation). isize : is the input size. Max supported value is LZ4_MAX_INPUT_SIZE return : maximum output size in a "worst case" scenario or 0, if input size is too large ( > LZ4_MAX_INPUT_SIZE) */ int LZ4_compressBound(int isize); /* LZ4_compress_limitedOutput() : Compress 'inputSize' bytes from 'source' into an output buffer 'dest' of maximum size 'maxOutputSize'. If it cannot achieve it, compression will stop, and result of the function will be zero. This function never writes outside of provided output buffer. inputSize : Max supported value is LZ4_MAX_INPUT_VALUE maxOutputSize : is the size of the destination buffer (which must be already allocated) return : the number of bytes written in buffer 'dest' or 0 if the compression fails */ int LZ4_compress_limitedOutput (const char* source, char* dest, int inputSize, int maxOutputSize); /* LZ4_compress_withState() : Same compression functions, but using an externally allocated memory space to store compression state. Use LZ4_sizeofState() to know how much memory must be allocated, and then, provide it as 'void* state' to compression functions. 
*/
int LZ4_sizeofState(void);
int LZ4_compress_withState (void* state, const char* source, char* dest, int inputSize);
int LZ4_compress_limitedOutput_withState (void* state, const char* source, char* dest, int inputSize, int maxOutputSize);

/*
LZ4_decompress_fast() :
    originalSize : is the original and therefore uncompressed size
    return : the number of bytes read from the source buffer (in other words, the compressed size)
             If the source stream is detected malformed, the function will stop decoding and return a negative result.
             Destination buffer must be already allocated. Its size must be a minimum of 'originalSize' bytes.
    note : This function fully respects memory boundaries for properly formed compressed data.
           It is a bit faster than LZ4_decompress_safe().
           However, it does not provide any protection against intentionally modified data streams (malicious input).
           Use this function in trusted environment only (data to decode comes from a trusted source).
*/
int LZ4_decompress_fast (const char* source, char* dest, int originalSize);

/*
LZ4_decompress_safe_partial() :
    This function decompresses a compressed block of size 'compressedSize' at position 'source'
    into destination buffer 'dest' of size 'maxDecompressedSize'.
    The function tries to stop decompressing operation as soon as 'targetOutputSize' has been reached,
    reducing decompression time.
    return : the number of bytes decoded in the destination buffer (necessarily <= maxDecompressedSize)
       Note : this number can be < 'targetOutputSize' should the compressed block to decode be smaller.
             Always control how many bytes were decoded.
             If the source stream is detected malformed, the function will stop decoding and return a negative result.
             This function never writes outside of output buffer, and never reads outside of input buffer.
It is therefore protected against malicious data packets */ int LZ4_decompress_safe_partial (const char* source, char* dest, int compressedSize, int targetOutputSize, int maxDecompressedSize); /*********************************************** Experimental Streaming Compression Functions ***********************************************/ #define LZ4_STREAMSIZE_U32 ((1 << (LZ4_MEMORY_USAGE-2)) + 8) #define LZ4_STREAMSIZE (LZ4_STREAMSIZE_U32 * sizeof(unsigned int)) /* * LZ4_stream_t * information structure to track an LZ4 stream. * important : init this structure content before first use ! */ typedef struct { unsigned int table[LZ4_STREAMSIZE_U32]; } LZ4_stream_t; /* * LZ4_resetStream * Use this function to init a newly allocated LZ4_stream_t structure * You can also reset an existing LZ4_stream_t structure */ void LZ4_resetStream (LZ4_stream_t* LZ4_stream); /* * If you prefer dynamic allocation methods, * LZ4_createStream will allocate and initialize an LZ4_stream_t structure * LZ4_freeStream releases its memory. */ LZ4_stream_t* LZ4_createStream(void); int LZ4_freeStream (LZ4_stream_t* LZ4_stream); /* * LZ4_loadDict * Use this function to load a static dictionary into LZ4_stream. * Any previous data will be forgotten, only 'dictionary' will remain in memory. * Loading a size of 0 is allowed. * Return : 1 if OK, 0 if error */ int LZ4_loadDict (LZ4_stream_t* LZ4_stream, const char* dictionary, int dictSize); /* * LZ4_compress_continue * Compress data block 'source', using blocks compressed before as dictionary to improve compression ratio * Previous data blocks are assumed to still be present at their previous location. */ int LZ4_compress_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize); /* * LZ4_compress_limitedOutput_continue * Same as before, but also specify a maximum target compressed size (maxOutputSize) * If objective cannot be met, compression exits, and returns a zero. 
*/ int LZ4_compress_limitedOutput_continue (LZ4_stream_t* LZ4_stream, const char* source, char* dest, int inputSize, int maxOutputSize); /* * LZ4_saveDict * If previously compressed data block is not guaranteed to remain available at its memory location * save it into a safe place (char* safeBuffer) * Note : you don't need to call LZ4_loadDict() afterwards, * dictionary is immediately usable, you can therefore call again LZ4_compress_continue() * Return : 1 if OK, 0 if error * Note : any dictSize > 64 KB will be interpreted as 64KB. */ int LZ4_saveDict (LZ4_stream_t* LZ4_stream, char* safeBuffer, int dictSize); /************************************************ Experimental Streaming Decompression Functions ************************************************/ #define LZ4_STREAMDECODESIZE_U32 4 #define LZ4_STREAMDECODESIZE (LZ4_STREAMDECODESIZE_U32 * sizeof(unsigned int)) /* * LZ4_streamDecode_t * information structure to track an LZ4 stream. * important : init this structure content using LZ4_setStreamDecode or memset() before first use ! */ typedef struct { unsigned int table[LZ4_STREAMDECODESIZE_U32]; } LZ4_streamDecode_t; /* * LZ4_setStreamDecode * Use this function to instruct where to find the dictionary. * This function can be used to specify a static dictionary, * or to instruct where to find some previously decoded data saved into a different memory space. * Setting a size of 0 is allowed (same effect as no dictionary). * Return : 1 if OK, 0 if error */ int LZ4_setStreamDecode (LZ4_streamDecode_t* LZ4_streamDecode, const char* dictionary, int dictSize); /* * If you prefer dynamic allocation methods, * LZ4_createStreamDecode will allocate and initialize an LZ4_streamDecode_t structure * LZ4_freeStreamDecode releases its memory. */ LZ4_streamDecode_t* LZ4_createStreamDecode(void); int LZ4_freeStreamDecode (LZ4_streamDecode_t* LZ4_stream); /* *_continue() : These decoding functions allow decompression of multiple blocks in "streaming" mode. 
    Previously decoded blocks must still be available at the memory position where they were decoded.
    If it's not possible, save the relevant part of decoded data into a safe buffer,
    and indicate its new address using LZ4_setDictDecode()
*/
int LZ4_decompress_safe_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int compressedSize, int maxOutputSize);
int LZ4_decompress_fast_continue (LZ4_streamDecode_t* LZ4_streamDecode, const char* source, char* dest, int originalSize);

/*
Advanced decoding functions :
*_usingDict() :
    These decoding functions work the same as
    a combination of LZ4_setDictDecode() followed by LZ4_decompress_x_continue()
    all together into a single function call.
    It doesn't use nor update an LZ4_streamDecode_t structure.
*/
int LZ4_decompress_safe_usingDict (const char* source, char* dest, int compressedSize, int maxOutputSize, const char* dictStart, int dictSize);
int LZ4_decompress_fast_usingDict (const char* source, char* dest, int originalSize, const char* dictStart, int dictSize);


/**************************************
   Obsolete Functions
**************************************/

/*
Obsolete decompression functions
These function names are deprecated and should no longer be used.
They are only provided here for compatibility with older user programs.
- LZ4_uncompress is the same as LZ4_decompress_fast
- LZ4_uncompress_unknownOutputSize is the same as LZ4_decompress_safe
These function prototypes are now disabled; uncomment them if you really need them.
It is highly recommended to stop using these functions and migrate to newer ones */
/* int LZ4_uncompress (const char* source, char* dest, int outputSize); */
/* int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize); */

/*
 * If you prefer dynamic allocation methods,
 * LZ4_createStreamDecode()
 * provides a pointer (void*) towards an initialized LZ4_streamDecode_t structure.
 * LZ4_free just frees it.
*/ /* void* LZ4_createStreamDecode(void); */ /*int LZ4_free (void* LZ4_stream); yes, it's the same one as for compression */ /* Obsolete streaming functions; use new streaming interface whenever possible */ void* LZ4_create (const char* inputBuffer); int LZ4_sizeofStreamState(void); int LZ4_resetStreamState(void* state, const char* inputBuffer); char* LZ4_slideInputBuffer (void* state); /* Obsolete streaming decoding functions */ int LZ4_decompress_safe_withPrefix64k (const char* source, char* dest, int compressedSize, int maxOutputSize); int LZ4_decompress_fast_withPrefix64k (const char* source, char* dest, int originalSize); #if defined (__cplusplus) } #endif ================================================ FILE: algorithm/murmurhash3.cc ================================================ //----------------------------------------------------------------------------- // MurmurHash3 was written by Austin Appleby, and is placed in the public // domain. The author hereby disclaims copyright to this source code. // Note - The x86 and x64 versions do _not_ produce the same results, as the // algorithms are optimized for their respective platforms. You can still // compile and run any of them on any platform, but your performance with the // non-native version will be less than optimal. 
#include "algorithm/murmurhash3.h"

//-----------------------------------------------------------------------------
// Platform-specific functions and macros

// Microsoft Visual Studio

#if defined(_MSC_VER)

#define FORCE_INLINE __forceinline

#include <stdlib.h>  // _rotl / _rotl64 intrinsics (header name restored; it was lost in extraction)

#define ROTL32(x,y) _rotl(x,y)
#define ROTL64(x,y) _rotl64(x,y)

#define BIG_CONSTANT(x) (x)

// Other compilers

#else // defined(_MSC_VER)

#define FORCE_INLINE inline __attribute__((always_inline))

// Portable rotate-left helpers for non-MSVC compilers.
inline uint32_t rotl32 ( uint32_t x, int8_t r )
{
  return (x << r) | (x >> (32 - r));
}

inline uint64_t rotl64 ( uint64_t x, int8_t r )
{
  return (x << r) | (x >> (64 - r));
}

#define ROTL32(x,y) rotl32(x,y)
#define ROTL64(x,y) rotl64(x,y)

#define BIG_CONSTANT(x) (x##LLU)

#endif // !defined(_MSC_VER)

//-----------------------------------------------------------------------------
// Block read - if your platform needs to do endian-swapping or can only
// handle aligned reads, do the conversion here

FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i )
{
  return p[i];
}

FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i )
{
  return p[i];
}

//-----------------------------------------------------------------------------
// Finalization mix - force all bits of a hash block to avalanche

FORCE_INLINE uint32_t fmix32 ( uint32_t h )
{
  h ^= h >> 16;
  h *= 0x85ebca6b;
  h ^= h >> 13;
  h *= 0xc2b2ae35;
  h ^= h >> 16;

  return h;
}

//----------

FORCE_INLINE uint64_t fmix64 ( uint64_t k )
{
  k ^= k >> 33;
  k *= BIG_CONSTANT(0xff51afd7ed558ccd);
  k ^= k >> 33;
  k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53);
  k ^= k >> 33;

  return k;
}

//-----------------------------------------------------------------------------
// 32-bit MurmurHash3, optimized for x86.
// Hashes 'len' bytes at 'key' with 'seed' and writes a 4-byte digest to 'out'.

void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out )
{
  const uint8_t * data = (const uint8_t*)key;
  const int nblocks = len / 4;

  uint32_t h1 = seed;

  const uint32_t c1 = 0xcc9e2d51;
  const uint32_t c2 = 0x1b873593;

  //----------
  // body

  const uint32_t * blocks = (const uint32_t *)(data + nblocks*4);

  for(int i = -nblocks; i; i++)
  {
    uint32_t k1 = getblock32(blocks,i);

    k1 *= c1;
    k1 = ROTL32(k1,15);
    k1 *= c2;

    h1 ^= k1;
    h1 = ROTL32(h1,13);
    h1 = h1*5+0xe6546b64;
  }

  //----------
  // tail: remaining 1-3 bytes (cases fall through intentionally)

  const uint8_t * tail = (const uint8_t*)(data + nblocks*4);

  uint32_t k1 = 0;

  switch(len & 3)
  {
  case 3: k1 ^= tail[2] << 16;
  case 2: k1 ^= tail[1] << 8;
  case 1: k1 ^= tail[0];
          k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
  };

  //----------
  // finalization

  h1 ^= len;

  h1 = fmix32(h1);

  *(uint32_t*)out = h1;
}

//-----------------------------------------------------------------------------
// 128-bit MurmurHash3, optimized for x86. Writes a 16-byte digest to 'out'.

void MurmurHash3_x86_128 ( const void * key, const int len, uint32_t seed, void * out )
{
  const uint8_t * data = (const uint8_t*)key;
  const int nblocks = len / 16;

  uint32_t h1 = seed;
  uint32_t h2 = seed;
  uint32_t h3 = seed;
  uint32_t h4 = seed;

  const uint32_t c1 = 0x239b961b;
  const uint32_t c2 = 0xab0e9789;
  const uint32_t c3 = 0x38b34ae5;
  const uint32_t c4 = 0xa1e38b93;

  //----------
  // body

  const uint32_t * blocks = (const uint32_t *)(data + nblocks*16);

  for(int i = -nblocks; i; i++)
  {
    uint32_t k1 = getblock32(blocks,i*4+0);
    uint32_t k2 = getblock32(blocks,i*4+1);
    uint32_t k3 = getblock32(blocks,i*4+2);
    uint32_t k4 = getblock32(blocks,i*4+3);

    k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
    h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b;

    k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
    h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747;

    k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
    h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35;

    k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
    h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17;
  }

  //----------
  // tail: remaining 1-15 bytes (cases fall through intentionally)

  const uint8_t * tail = (const uint8_t*)(data + nblocks*16);

  uint32_t k1 = 0;
  uint32_t k2 = 0;
  uint32_t k3 = 0;
  uint32_t k4 = 0;

  switch(len & 15)
  {
  case 15: k4 ^= tail[14] << 16;
  case 14: k4 ^= tail[13] << 8;
  case 13: k4 ^= tail[12] << 0;
           k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4;

  case 12: k3 ^= tail[11] << 24;
  case 11: k3 ^= tail[10] << 16;
  case 10: k3 ^= tail[ 9] << 8;
  case  9: k3 ^= tail[ 8] << 0;
           k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3;

  case  8: k2 ^= tail[ 7] << 24;
  case  7: k2 ^= tail[ 6] << 16;
  case  6: k2 ^= tail[ 5] << 8;
  case  5: k2 ^= tail[ 4] << 0;
           k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2;

  case  4: k1 ^= tail[ 3] << 24;
  case  3: k1 ^= tail[ 2] << 16;
  case  2: k1 ^= tail[ 1] << 8;
  case  1: k1 ^= tail[ 0] << 0;
           k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
  };

  //----------
  // finalization

  h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len;

  h1 += h2; h1 += h3; h1 += h4;
  h2 += h1; h3 += h1; h4 += h1;

  h1 = fmix32(h1);
  h2 = fmix32(h2);
  h3 = fmix32(h3);
  h4 = fmix32(h4);

  h1 += h2; h1 += h3; h1 += h4;
  h2 += h1; h3 += h1; h4 += h1;

  ((uint32_t*)out)[0] = h1;
  ((uint32_t*)out)[1] = h2;
  ((uint32_t*)out)[2] = h3;
  ((uint32_t*)out)[3] = h4;
}

//-----------------------------------------------------------------------------
// 128-bit MurmurHash3, optimized for x64. Writes a 16-byte digest to 'out'.
// Note: produces different results than the x86 variants (by design).

void MurmurHash3_x64_128 ( const void * key, const int len, const uint32_t seed, void * out )
{
  const uint8_t * data = (const uint8_t*)key;
  const int nblocks = len / 16;

  uint64_t h1 = seed;
  uint64_t h2 = seed;

  const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
  const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);

  //----------
  // body

  const uint64_t * blocks = (const uint64_t *)(data);

  for(int i = 0; i < nblocks; i++)
  {
    uint64_t k1 = getblock64(blocks,i*2+0);
    uint64_t k2 = getblock64(blocks,i*2+1);

    k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
    h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;

    k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
    h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
  }

  //----------
  // tail: remaining 1-15 bytes (cases fall through intentionally)

  const uint8_t * tail = (const uint8_t*)(data + nblocks*16);

  uint64_t k1 = 0;
  uint64_t k2 = 0;

  switch(len & 15)
  {
  case 15: k2 ^= ((uint64_t)tail[14]) << 48;
  case 14: k2 ^= ((uint64_t)tail[13]) << 40;
  case 13: k2 ^= ((uint64_t)tail[12]) << 32;
  case 12: k2 ^= ((uint64_t)tail[11]) << 24;
  case 11: k2 ^= ((uint64_t)tail[10]) << 16;
  case 10: k2 ^= ((uint64_t)tail[ 9]) << 8;
  case  9: k2 ^= ((uint64_t)tail[ 8]) << 0;
           k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2;

  case  8: k1 ^= ((uint64_t)tail[ 7]) << 56;
  case  7: k1 ^= ((uint64_t)tail[ 6]) << 48;
  case  6: k1 ^= ((uint64_t)tail[ 5]) << 40;
  case  5: k1 ^= ((uint64_t)tail[ 4]) << 32;
  case  4: k1 ^= ((uint64_t)tail[ 3]) << 24;
  case  3: k1 ^= ((uint64_t)tail[ 2]) << 16;
  case  2: k1 ^= ((uint64_t)tail[ 1]) << 8;
  case  1: k1 ^= ((uint64_t)tail[ 0]) << 0;
           k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
  };

  //----------
  // finalization

  h1 ^= len; h2 ^= len;

  h1 += h2;
  h2 += h1;

  h1 = fmix64(h1);
  h2 = fmix64(h2);

  h1 += h2;
  h2 += h1;

  ((uint64_t*)out)[0] = h1;
  ((uint64_t*)out)[1] = h2;
}

//-----------------------------------------------------------------------------

================================================
FILE: algorithm/murmurhash3.h
================================================
//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.
#ifndef _MURMURHASH3_H_ #define _MURMURHASH3_H_ //----------------------------------------------------------------------------- // Platform-specific functions and macros // Microsoft Visual Studio #if defined(_MSC_VER) typedef unsigned char uint8_t; typedef unsigned long uint32_t; typedef unsigned __int64 uint64_t; // Other compilers #else // defined(_MSC_VER) #include #endif // !defined(_MSC_VER) //----------------------------------------------------------------------------- void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); //----------------------------------------------------------------------------- #endif // _MURMURHASH3_H_ ================================================ FILE: algorithm/xxhash.cc ================================================ /* xxHash - Fast Hash algorithm Copyright (C) 2012-2014, Yann Collet. BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. You can contact the author at : - xxHash source repository : http://code.google.com/p/xxhash/ */ //************************************** // Tuning parameters //************************************** // Unaligned memory access is automatically enabled for "common" CPU, such as x86. // For others CPU, the compiler will be more cautious, and insert extra code to ensure aligned access is respected. // If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance. // You can also enable this parameter if you know your input data will always be aligned (boundaries of 4, for U32). #if defined(__ARM_FEATURE_UNALIGNED) || defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64) # define XXH_USE_UNALIGNED_ACCESS 1 #endif // XXH_ACCEPT_NULL_INPUT_POINTER : // If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer. // When this option is enabled, xxHash output for null input pointers will be the same as a null-length input. // This option has a very small performance cost (only measurable on small inputs). // By default, this option is disabled. To enable it, uncomment below define : // #define XXH_ACCEPT_NULL_INPUT_POINTER 1 // XXH_FORCE_NATIVE_FORMAT : // By default, xxHash library provides endian-independant Hash values, based on little-endian convention. 
// Results are therefore identical for little-endian and big-endian CPU. // This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format. // Should endian-independance be of no importance for your application, you may set the #define below to 1. // It will improve speed for Big-endian CPU. // This option has no impact on Little_Endian CPU. #define XXH_FORCE_NATIVE_FORMAT 0 //************************************** // Compiler Specific Options //************************************** // Disable some Visual warning messages #ifdef _MSC_VER // Visual Studio # pragma warning(disable : 4127) // disable: C4127: conditional expression is constant #endif #ifdef _MSC_VER // Visual Studio # define FORCE_INLINE static __forceinline #else # ifdef __GNUC__ # define FORCE_INLINE static inline __attribute__((always_inline)) # else # define FORCE_INLINE static inline # endif #endif //************************************** // Includes & Memory related functions //************************************** #include "algorithm/xxhash.h" // Modify the local functions below should you wish to use some other memory related routines // for malloc(), free() #include FORCE_INLINE void* XXH_malloc(size_t s) { return malloc(s); } FORCE_INLINE void XXH_free (void* p) { free(p); } // for memcpy() #include FORCE_INLINE void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); } //************************************** // Basic Types //************************************** #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L // C99 # include typedef uint8_t BYTE; typedef uint16_t U16; typedef uint32_t U32; typedef int32_t S32; typedef uint64_t U64; #else typedef unsigned char BYTE; typedef unsigned short U16; typedef unsigned int U32; typedef signed int S32; typedef unsigned long long U64; #endif #if defined(__GNUC__) && !defined(XXH_USE_UNALIGNED_ACCESS) # define _PACKED __attribute__ ((packed)) #else 
# define _PACKED #endif #if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__) # ifdef __IBMC__ # pragma pack(1) # else # pragma pack(push, 1) # endif #endif typedef struct _U32_S { U32 v; } _PACKED U32_S; typedef struct _U64_S { U64 v; } _PACKED U64_S; #if !defined(XXH_USE_UNALIGNED_ACCESS) && !defined(__GNUC__) # pragma pack(pop) #endif #define A32(x) (((U32_S *)(x))->v) #define A64(x) (((U64_S *)(x))->v) //*************************************** // Compiler-specific Functions and Macros //*************************************** #define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) // Note : although _rotl exists for minGW (GCC under windows), performance seems poor #if defined(_MSC_VER) # define XXH_rotl32(x,r) _rotl(x,r) # define XXH_rotl64(x,r) _rotl64(x,r) #else # define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r))) # define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r))) #endif #if defined(_MSC_VER) // Visual Studio # define XXH_swap32 _byteswap_ulong # define XXH_swap64 _byteswap_uint64 #elif GCC_VERSION >= 403 # define XXH_swap32 __builtin_bswap32 # define XXH_swap64 __builtin_bswap64 #else static inline U32 XXH_swap32 (U32 x) { return ((x << 24) & 0xff000000 ) | ((x << 8) & 0x00ff0000 ) | ((x >> 8) & 0x0000ff00 ) | ((x >> 24) & 0x000000ff );} static inline U64 XXH_swap64 (U64 x) { return ((x << 56) & 0xff00000000000000ULL) | ((x << 40) & 0x00ff000000000000ULL) | ((x << 24) & 0x0000ff0000000000ULL) | ((x << 8) & 0x000000ff00000000ULL) | ((x >> 8) & 0x00000000ff000000ULL) | ((x >> 24) & 0x0000000000ff0000ULL) | ((x >> 40) & 0x000000000000ff00ULL) | ((x >> 56) & 0x00000000000000ffULL);} #endif //************************************** // Constants //************************************** #define PRIME32_1 2654435761U #define PRIME32_2 2246822519U #define PRIME32_3 3266489917U #define PRIME32_4 668265263U #define PRIME32_5 374761393U #define PRIME64_1 11400714785074694791ULL #define PRIME64_2 14029467366897019727ULL #define PRIME64_3 1609587929392839161ULL 
#define PRIME64_4 9650029242287828579ULL #define PRIME64_5 2870177450012600261ULL //************************************** // Architecture Macros //************************************** typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess; #ifndef XXH_CPU_LITTLE_ENDIAN // It is possible to define XXH_CPU_LITTLE_ENDIAN externally, for example using a compiler switch static const int one = 1; # define XXH_CPU_LITTLE_ENDIAN (*(char*)(&one)) #endif //************************************** // Macros //************************************** #define XXH_STATIC_ASSERT(c) { enum { XXH_static_assert = 1/(!!(c)) }; } // use only *after* variable declarations //**************************** // Memory reads //**************************** typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment; FORCE_INLINE U32 XXH_readLE32_align(const U32* ptr, XXH_endianess endian, XXH_alignment align) { if (align==XXH_unaligned) return endian==XXH_littleEndian ? A32(ptr) : XXH_swap32(A32(ptr)); else return endian==XXH_littleEndian ? *ptr : XXH_swap32(*ptr); } FORCE_INLINE U32 XXH_readLE32(const U32* ptr, XXH_endianess endian) { return XXH_readLE32_align(ptr, endian, XXH_unaligned); } FORCE_INLINE U64 XXH_readLE64_align(const U64* ptr, XXH_endianess endian, XXH_alignment align) { if (align==XXH_unaligned) return endian==XXH_littleEndian ? A64(ptr) : XXH_swap64(A64(ptr)); else return endian==XXH_littleEndian ? 
*ptr : XXH_swap64(*ptr); } FORCE_INLINE U64 XXH_readLE64(const U64* ptr, XXH_endianess endian) { return XXH_readLE64_align(ptr, endian, XXH_unaligned); } //**************************** // Simple Hash Functions //**************************** FORCE_INLINE U32 XXH32_endian_align(const void* input, unsigned int len, U32 seed, XXH_endianess endian, XXH_alignment align) { const BYTE* p = (const BYTE*)input; const BYTE* bEnd = p + len; U32 h32; #define XXH_get32bits(p) XXH_readLE32_align((const U32*)p, endian, align) #ifdef XXH_ACCEPT_NULL_INPUT_POINTER if (p==NULL) { len=0; bEnd=p=(const BYTE*)(size_t)16; } #endif if (len>=16) { const BYTE* const limit = bEnd - 16; U32 v1 = seed + PRIME32_1 + PRIME32_2; U32 v2 = seed + PRIME32_2; U32 v3 = seed + 0; U32 v4 = seed - PRIME32_1; do { v1 += XXH_get32bits(p) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4; v2 += XXH_get32bits(p) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4; v3 += XXH_get32bits(p) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4; v4 += XXH_get32bits(p) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4; } while (p<=limit); h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); } else { h32 = seed + PRIME32_5; } h32 += (U32) len; while (p<=bEnd-4) { h32 += XXH_get32bits(p) * PRIME32_3; h32 = XXH_rotl32(h32, 17) * PRIME32_4 ; p+=4; } while (p> 15; h32 *= PRIME32_2; h32 ^= h32 >> 13; h32 *= PRIME32_3; h32 ^= h32 >> 16; return h32; } U32 XXH32(const void* input, unsigned int len, U32 seed) { #if 0 // Simple version, good for code maintenance, but unfortunately slow for small inputs void* state = XXH32_init(seed); XXH32_update(state, input, len); return XXH32_digest(state); #else XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; # if !defined(XXH_USE_UNALIGNED_ACCESS) if ((((size_t)input) & 3) == 0) // Input is aligned, let's leverage the speed advantage { if ((endian_detected==XXH_littleEndian) || 
XXH_FORCE_NATIVE_FORMAT) return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); else return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); } # endif if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); else return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); #endif } FORCE_INLINE U64 XXH64_endian_align(const void* input, unsigned int len, U64 seed, XXH_endianess endian, XXH_alignment align) { const BYTE* p = (const BYTE*)input; const BYTE* bEnd = p + len; U64 h64; #define XXH_get64bits(p) XXH_readLE64_align((const U64*)p, endian, align) #ifdef XXH_ACCEPT_NULL_INPUT_POINTER if (p==NULL) { len=0; bEnd=p=(const BYTE*)(size_t)32; } #endif if (len>=32) { const BYTE* const limit = bEnd - 32; U64 v1 = seed + PRIME64_1 + PRIME64_2; U64 v2 = seed + PRIME64_2; U64 v3 = seed + 0; U64 v4 = seed - PRIME64_1; do { v1 += XXH_get64bits(p) * PRIME64_2; p+=8; v1 = XXH_rotl64(v1, 31); v1 *= PRIME64_1; v2 += XXH_get64bits(p) * PRIME64_2; p+=8; v2 = XXH_rotl64(v2, 31); v2 *= PRIME64_1; v3 += XXH_get64bits(p) * PRIME64_2; p+=8; v3 = XXH_rotl64(v3, 31); v3 *= PRIME64_1; v4 += XXH_get64bits(p) * PRIME64_2; p+=8; v4 = XXH_rotl64(v4, 31); v4 *= PRIME64_1; } while (p<=limit); h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); v1 *= PRIME64_2; v1 = XXH_rotl64(v1, 31); v1 *= PRIME64_1; h64 ^= v1; h64 = h64 * PRIME64_1 + PRIME64_4; v2 *= PRIME64_2; v2 = XXH_rotl64(v2, 31); v2 *= PRIME64_1; h64 ^= v2; h64 = h64 * PRIME64_1 + PRIME64_4; v3 *= PRIME64_2; v3 = XXH_rotl64(v3, 31); v3 *= PRIME64_1; h64 ^= v3; h64 = h64 * PRIME64_1 + PRIME64_4; v4 *= PRIME64_2; v4 = XXH_rotl64(v4, 31); v4 *= PRIME64_1; h64 ^= v4; h64 = h64 * PRIME64_1 + PRIME64_4; } else { h64 = seed + PRIME64_5; } h64 += (U64) len; while (p<=bEnd-8) { U64 k1 = XXH_get64bits(p); k1 *= PRIME64_2; k1 = XXH_rotl64(k1,31); k1 *= PRIME64_1; h64 ^= k1; 
h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; p+=8; } #if 1 if (p<=bEnd-4) { h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1; h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; p+=4; } while (p> 33; h64 *= PRIME64_2; h64 ^= h64 >> 29; h64 *= PRIME64_3; h64 ^= h64 >> 32; return h64; } unsigned long long XXH64(const void* input, unsigned int len, unsigned long long seed) { XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; # if !defined(XXH_USE_UNALIGNED_ACCESS) if ((((size_t)input) & 7)==0) // Input is aligned, let's leverage the speed advantage { if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned); else return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned); } # endif if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned); else return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned); } //**************************** // Advanced Hash Functions //**************************** struct XXH_state32_t { U64 total_len; U32 seed; U32 v1; U32 v2; U32 v3; U32 v4; int memsize; char memory[16]; }; struct XXH_state64_t { U64 total_len; U64 seed; U64 v1; U64 v2; U64 v3; U64 v4; int memsize; char memory[32]; }; int XXH32_sizeofState(void) { XXH_STATIC_ASSERT(XXH32_SIZEOFSTATE >= sizeof(struct XXH_state32_t)); // A compilation error here means XXH32_SIZEOFSTATE is not large enough return sizeof(struct XXH_state32_t); } int XXH64_sizeofState(void) { XXH_STATIC_ASSERT(XXH64_SIZEOFSTATE >= sizeof(struct XXH_state64_t)); // A compilation error here means XXH64_SIZEOFSTATE is not large enough return sizeof(struct XXH_state64_t); } XXH_errorcode XXH32_resetState(void* state_in, U32 seed) { struct XXH_state32_t * state = (struct XXH_state32_t *) state_in; state->seed = seed; state->v1 = seed + PRIME32_1 + PRIME32_2; state->v2 = seed + PRIME32_2; state->v3 = 
seed + 0; state->v4 = seed - PRIME32_1; state->total_len = 0; state->memsize = 0; return XXH_OK; } XXH_errorcode XXH64_resetState(void* state_in, unsigned long long seed) { struct XXH_state64_t * state = (struct XXH_state64_t *) state_in; state->seed = seed; state->v1 = seed + PRIME64_1 + PRIME64_2; state->v2 = seed + PRIME64_2; state->v3 = seed + 0; state->v4 = seed - PRIME64_1; state->total_len = 0; state->memsize = 0; return XXH_OK; } void* XXH32_init (U32 seed) { void* state = XXH_malloc (sizeof(struct XXH_state32_t)); XXH32_resetState(state, seed); return state; } void* XXH64_init (unsigned long long seed) { void* state = XXH_malloc (sizeof(struct XXH_state64_t)); XXH64_resetState(state, seed); return state; } FORCE_INLINE XXH_errorcode XXH32_update_endian (void* state_in, const void* input, int len, XXH_endianess endian) { struct XXH_state32_t * state = (struct XXH_state32_t *) state_in; const BYTE* p = (const BYTE*)input; const BYTE* const bEnd = p + len; #ifdef XXH_ACCEPT_NULL_INPUT_POINTER if (input==NULL) return XXH_ERROR; #endif state->total_len += len; if (state->memsize + len < 16) // fill in tmp buffer { XXH_memcpy(state->memory + state->memsize, input, len); state->memsize += len; return XXH_OK; } if (state->memsize) // some data left from previous update { XXH_memcpy(state->memory + state->memsize, input, 16-state->memsize); { const U32* p32 = (const U32*)state->memory; state->v1 += XXH_readLE32(p32, endian) * PRIME32_2; state->v1 = XXH_rotl32(state->v1, 13); state->v1 *= PRIME32_1; p32++; state->v2 += XXH_readLE32(p32, endian) * PRIME32_2; state->v2 = XXH_rotl32(state->v2, 13); state->v2 *= PRIME32_1; p32++; state->v3 += XXH_readLE32(p32, endian) * PRIME32_2; state->v3 = XXH_rotl32(state->v3, 13); state->v3 *= PRIME32_1; p32++; state->v4 += XXH_readLE32(p32, endian) * PRIME32_2; state->v4 = XXH_rotl32(state->v4, 13); state->v4 *= PRIME32_1; p32++; } p += 16-state->memsize; state->memsize = 0; } if (p <= bEnd-16) { const BYTE* const limit = bEnd - 
16; U32 v1 = state->v1; U32 v2 = state->v2; U32 v3 = state->v3; U32 v4 = state->v4; do { v1 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; v1 = XXH_rotl32(v1, 13); v1 *= PRIME32_1; p+=4; v2 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; v2 = XXH_rotl32(v2, 13); v2 *= PRIME32_1; p+=4; v3 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; v3 = XXH_rotl32(v3, 13); v3 *= PRIME32_1; p+=4; v4 += XXH_readLE32((const U32*)p, endian) * PRIME32_2; v4 = XXH_rotl32(v4, 13); v4 *= PRIME32_1; p+=4; } while (p<=limit); state->v1 = v1; state->v2 = v2; state->v3 = v3; state->v4 = v4; } if (p < bEnd) { XXH_memcpy(state->memory, p, bEnd-p); state->memsize = (int)(bEnd-p); } return XXH_OK; } XXH_errorcode XXH32_update (void* state_in, const void* input, unsigned int len) { XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) return XXH32_update_endian(state_in, input, len, XXH_littleEndian); else return XXH32_update_endian(state_in, input, len, XXH_bigEndian); } FORCE_INLINE U32 XXH32_intermediateDigest_endian (void* state_in, XXH_endianess endian) { struct XXH_state32_t * state = (struct XXH_state32_t *) state_in; const BYTE * p = (const BYTE*)state->memory; BYTE* bEnd = (BYTE*)state->memory + state->memsize; U32 h32; if (state->total_len >= 16) { h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18); } else { h32 = state->seed + PRIME32_5; } h32 += (U32) state->total_len; while (p<=bEnd-4) { h32 += XXH_readLE32((const U32*)p, endian) * PRIME32_3; h32 = XXH_rotl32(h32, 17) * PRIME32_4; p+=4; } while (p> 15; h32 *= PRIME32_2; h32 ^= h32 >> 13; h32 *= PRIME32_3; h32 ^= h32 >> 16; return h32; } U32 XXH32_intermediateDigest (void* state_in) { XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) return XXH32_intermediateDigest_endian(state_in, 
XXH_littleEndian); else return XXH32_intermediateDigest_endian(state_in, XXH_bigEndian); } U32 XXH32_digest (void* state_in) { U32 h32 = XXH32_intermediateDigest(state_in); XXH_free(state_in); return h32; } FORCE_INLINE XXH_errorcode XXH64_update_endian (void* state_in, const void* input, int len, XXH_endianess endian) { struct XXH_state64_t * state = (struct XXH_state64_t *) state_in; const BYTE* p = (const BYTE*)input; const BYTE* const bEnd = p + len; #ifdef XXH_ACCEPT_NULL_INPUT_POINTER if (input==NULL) return XXH_ERROR; #endif state->total_len += len; if (state->memsize + len < 32) // fill in tmp buffer { XXH_memcpy(state->memory + state->memsize, input, len); state->memsize += len; return XXH_OK; } if (state->memsize) // some data left from previous update { XXH_memcpy(state->memory + state->memsize, input, 32-state->memsize); { const U64* p64 = (const U64*)state->memory; state->v1 += XXH_readLE64(p64, endian) * PRIME64_2; state->v1 = XXH_rotl64(state->v1, 31); state->v1 *= PRIME64_1; p64++; state->v2 += XXH_readLE64(p64, endian) * PRIME64_2; state->v2 = XXH_rotl64(state->v2, 31); state->v2 *= PRIME64_1; p64++; state->v3 += XXH_readLE64(p64, endian) * PRIME64_2; state->v3 = XXH_rotl64(state->v3, 31); state->v3 *= PRIME64_1; p64++; state->v4 += XXH_readLE64(p64, endian) * PRIME64_2; state->v4 = XXH_rotl64(state->v4, 31); state->v4 *= PRIME64_1; p64++; } p += 32-state->memsize; state->memsize = 0; } if (p <= bEnd-32) { const BYTE* const limit = bEnd - 32; U64 v1 = state->v1; U64 v2 = state->v2; U64 v3 = state->v3; U64 v4 = state->v4; do { v1 += XXH_readLE64((const U64*)p, endian) * PRIME64_2; v1 = XXH_rotl64(v1, 31); v1 *= PRIME64_1; p+=8; v2 += XXH_readLE64((const U64*)p, endian) * PRIME64_2; v2 = XXH_rotl64(v2, 31); v2 *= PRIME64_1; p+=8; v3 += XXH_readLE64((const U64*)p, endian) * PRIME64_2; v3 = XXH_rotl64(v3, 31); v3 *= PRIME64_1; p+=8; v4 += XXH_readLE64((const U64*)p, endian) * PRIME64_2; v4 = XXH_rotl64(v4, 31); v4 *= PRIME64_1; p+=8; } while 
(p<=limit); state->v1 = v1; state->v2 = v2; state->v3 = v3; state->v4 = v4; } if (p < bEnd) { XXH_memcpy(state->memory, p, bEnd-p); state->memsize = (int)(bEnd-p); } return XXH_OK; } XXH_errorcode XXH64_update (void* state_in, const void* input, unsigned int len) { XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) return XXH64_update_endian(state_in, input, len, XXH_littleEndian); else return XXH64_update_endian(state_in, input, len, XXH_bigEndian); } FORCE_INLINE U64 XXH64_intermediateDigest_endian (void* state_in, XXH_endianess endian) { struct XXH_state64_t * state = (struct XXH_state64_t *) state_in; const BYTE * p = (const BYTE*)state->memory; BYTE* bEnd = (BYTE*)state->memory + state->memsize; U64 h64; if (state->total_len >= 32) { U64 v1 = state->v1; U64 v2 = state->v2; U64 v3 = state->v3; U64 v4 = state->v4; h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); v1 *= PRIME64_2; v1 = XXH_rotl64(v1, 31); v1 *= PRIME64_1; h64 ^= v1; h64 = h64*PRIME64_1 + PRIME64_4; v2 *= PRIME64_2; v2 = XXH_rotl64(v2, 31); v2 *= PRIME64_1; h64 ^= v2; h64 = h64*PRIME64_1 + PRIME64_4; v3 *= PRIME64_2; v3 = XXH_rotl64(v3, 31); v3 *= PRIME64_1; h64 ^= v3; h64 = h64*PRIME64_1 + PRIME64_4; v4 *= PRIME64_2; v4 = XXH_rotl64(v4, 31); v4 *= PRIME64_1; h64 ^= v4; h64 = h64*PRIME64_1 + PRIME64_4; } else { h64 = state->seed + PRIME64_5; } h64 += (U64) state->total_len; while (p<=bEnd-8) { U64 k1 = XXH_readLE64((const U64*)p, endian); k1 *= PRIME64_2; k1 = XXH_rotl64(k1,31); k1 *= PRIME64_1; h64 ^= k1; h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; p+=8; } if (p<=bEnd-4) { h64 ^= (U64)(XXH_readLE32((const U32*)p, endian)) * PRIME64_1; h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; p+=4; } while (p> 33; h64 *= PRIME64_2; h64 ^= h64 >> 29; h64 *= PRIME64_3; h64 ^= h64 >> 32; return h64; } unsigned long long XXH64_intermediateDigest (void* state_in) { XXH_endianess 
endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) return XXH64_intermediateDigest_endian(state_in, XXH_littleEndian); else return XXH64_intermediateDigest_endian(state_in, XXH_bigEndian); } unsigned long long XXH64_digest (void* state_in) { U64 h64 = XXH64_intermediateDigest(state_in); XXH_free(state_in); return h64; } ================================================ FILE: algorithm/xxhash.h ================================================ /* xxHash - Extremely Fast Hash algorithm Header File Copyright (C) 2012-2014, Yann Collet. BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
You can contact the author at : - xxHash source repository : http://code.google.com/p/xxhash/ */ /* Notice extracted from xxHash homepage : xxHash is an extremely fast Hash algorithm, running at RAM speed limits. It also successfully passes all tests from the SMHasher suite. Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz) Name Speed Q.Score Author xxHash 5.4 GB/s 10 CrapWow 3.2 GB/s 2 Andrew MumurHash 3a 2.7 GB/s 10 Austin Appleby SpookyHash 2.0 GB/s 10 Bob Jenkins SBox 1.4 GB/s 9 Bret Mulvey Lookup3 1.2 GB/s 9 Bob Jenkins SuperFastHash 1.2 GB/s 1 Paul Hsieh CityHash64 1.05 GB/s 10 Pike & Alakuijala FNV 0.55 GB/s 5 Fowler, Noll, Vo CRC32 0.43 GB/s 9 MD5-32 0.33 GB/s 10 Ronald L. Rivest SHA1-32 0.28 GB/s 10 Q.Score is a measure of quality of the hash function. It depends on successfully passing SMHasher test set. 10 is a perfect score. */ #pragma once #if defined (__cplusplus) extern "C" { #endif /***************************** Type *****************************/ typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode; /***************************** Simple Hash Functions *****************************/ unsigned int XXH32 (const void* input, unsigned int len, unsigned int seed); unsigned long long XXH64 (const void* input, unsigned int len, unsigned long long seed); /* XXH32() : Calculate the 32-bits hash of sequence of length "len" stored at memory address "input". The memory between input & input+len must be valid (allocated and read-accessible). "seed" can be used to alter the result predictably. This function successfully passes all SMHasher tests. Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s Note that "len" is type "int", which means it is limited to 2^31-1. If your data is larger, use the advanced functions below. XXH64() : Calculate the 64-bits hash of sequence of length "len" stored at memory address "input". 
*/ /***************************** Advanced Hash Functions *****************************/ void* XXH32_init (unsigned int seed); XXH_errorcode XXH32_update (void* state, const void* input, unsigned int len); unsigned int XXH32_digest (void* state); void* XXH64_init (unsigned long long seed); XXH_errorcode XXH64_update (void* state, const void* input, unsigned int len); unsigned long long XXH64_digest (void* state); /* These functions calculate the xxhash of an input provided in several small packets, as opposed to an input provided as a single block. It must be started with : void* XXHnn_init() The function returns a pointer which holds the state of calculation. This pointer must be provided as "void* state" parameter for XXHnn_update(). XXHnn_update() can be called as many times as necessary. The user must provide a valid (allocated) input. The function returns an error code, with 0 meaning OK, and any other value meaning there is an error. Note that "len" is type "int", which means it is limited to 2^31-1. If your data is larger, it is recommended to chunk your data into blocks of size for example 2^30 (1GB) to avoid any "int" overflow issue. Finally, you can end the calculation anytime, by using XXHnn_digest(). This function returns the final nn-bits hash. You must provide the same "void* state" parameter created by XXHnn_init(). Memory will be freed by XXHnn_digest(). */ int XXH32_sizeofState(void); XXH_errorcode XXH32_resetState(void* state, unsigned int seed); #define XXH32_SIZEOFSTATE 48 typedef struct { long long ll[(XXH32_SIZEOFSTATE+(sizeof(long long)-1))/sizeof(long long)]; } XXH32_stateSpace_t; int XXH64_sizeofState(void); XXH_errorcode XXH64_resetState(void* state, unsigned long long seed); #define XXH64_SIZEOFSTATE 88 typedef struct { long long ll[(XXH64_SIZEOFSTATE+(sizeof(long long)-1))/sizeof(long long)]; } XXH64_stateSpace_t; /* These functions allow user application to make its own allocation for state. 
XXHnn_sizeofState() is used to know how much space must be allocated for the xxHash nn-bits state. Note that the state must be aligned to access 'long long' fields. Memory must be allocated and referenced by a pointer. This pointer must then be provided as 'state' into XXHnn_resetState(), which initializes the state. For static allocation purposes (such as allocation on stack, or freestanding systems without malloc()), use the structure XXHnn_stateSpace_t, which will ensure that memory space is large enough and correctly aligned to access 'long long' fields. */ unsigned int XXH32_intermediateDigest (void* state); unsigned long long XXH64_intermediateDigest (void* state); /* This function does the same as XXHnn_digest(), generating a nn-bit hash, but preserve memory context. This way, it becomes possible to generate intermediate hashes, and then continue feeding data with XXHnn_update(). To free memory context, use XXHnn_digest(), or free(). */ #if defined (__cplusplus) } #endif ================================================ FILE: cache/rate_limiter.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. 
#ifndef KINGDB_RATE_LIMITER_H_ #define KINGDB_RATE_LIMITER_H_ #include "util/debug.h" #include namespace kdb { class RateLimiter { public: RateLimiter(uint64_t rate_limit) : rate_limit_(rate_limit), rate_incoming_(250 * 1024 * 1024), rate_incoming_adjusted_(0), rate_writing_default_(1 * 1024 * 1024), epoch_last_(0), epoch_current_(0), duration_slept_(0), bytes_per_microsecond_(5) { } ~RateLimiter() { } void Tick(uint64_t bytes_incoming) { epoch_current_ = std::time(0); if (epoch_current_ != epoch_last_) { rate_incoming_adjusted_ = rate_incoming_ + bytes_per_microsecond_ * duration_slept_; log::trace("RateLimiter::Tick()", "rate_incoming_: %" PRIu64 " rate_incoming_adjusted_:%" PRIu64, rate_incoming_, rate_incoming_adjusted_); duration_slept_ = 0; rate_incoming_ = 0; epoch_last_ = epoch_current_; uint64_t rate_writing = GetWritingRate(); double ratio = (double)rate_incoming_adjusted_ / (double)rate_writing; if (ratio > 1.0) { // The rate of incoming data is greater than the rate at which data // can be written, therefore the number of bytes for each microsecond // slept must be decreased: this means that less bytes will trigger the // same amount of microseconds slept, which will increase the amount // of time spent sleeping, and bring the rate of incoming data closer // to the rate at which data is written. if (ratio > 1.50) { bytes_per_microsecond_ *= 0.75; } else if (ratio > 1.10) { bytes_per_microsecond_ *= 0.95; } else if (ratio > 1.05) { bytes_per_microsecond_ *= 0.99; } else { bytes_per_microsecond_ *= 0.995; } if (bytes_per_microsecond_ <= 5) bytes_per_microsecond_ += 1; log::trace("RateLimiter::Tick()", "decreasing"); } else { // The rate of incoming data is lower than the rate at which data // can be written, therefore bytes_per_microsecond_ needs to be // increased. 
if (ratio < 0.5) { bytes_per_microsecond_ *= 1.25; } else if (ratio < 0.90) { bytes_per_microsecond_ *= 1.05; } else if (ratio < 0.95) { bytes_per_microsecond_ *= 1.01; } else { bytes_per_microsecond_ *= 1.005; } log::trace("RateLimiter::Tick()", "increasing"); if (bytes_per_microsecond_ <= 5) bytes_per_microsecond_ += 1; } log::trace("RateLimiter::Tick()", "limit rate: bytes_per_microsecond_: %" PRIu64 " rate_writing:%" PRIu64, bytes_per_microsecond_, rate_writing); } mutex_throttling_.lock(); rate_incoming_ += bytes_incoming; uint64_t sleep_microseconds = 0; if (bytes_per_microsecond_ > 0) { sleep_microseconds = bytes_incoming / bytes_per_microsecond_; } mutex_throttling_.unlock(); if (sleep_microseconds > 50000) sleep_microseconds = 50000; if (sleep_microseconds) { log::trace("RateLimiter::Tick()", "bytes_per_microsecond_: %" PRIu64 ", sleep_microseconds: %" PRIu64, bytes_per_microsecond_, sleep_microseconds); std::chrono::microseconds duration(sleep_microseconds); std::this_thread::sleep_for(duration); duration_slept_ += sleep_microseconds; } } void WriteStart() { auto epoch = std::chrono::system_clock::now().time_since_epoch(); epoch_write_start_ = std::chrono::duration_cast(epoch).count(); } void WriteEnd(uint64_t num_bytes_written) { auto epoch = std::chrono::system_clock::now().time_since_epoch(); epoch_write_end_ = std::chrono::duration_cast(epoch).count(); uint64_t rate_writing = num_bytes_written; mutex_throttling_.lock(); if (epoch_write_start_ == epoch_write_end_) { rate_writing = num_bytes_written; } else { double duration = ((double)epoch_write_end_ - (double)epoch_write_start_) / 1000; rate_writing = (uint64_t)((double)num_bytes_written / (double)duration); } StoreWritingRate(rate_writing); mutex_throttling_.unlock(); } void StoreWritingRate(uint64_t rate) { if (rates_.size() >= 10) { rates_.erase(rates_.begin()); } rates_.push_back(rate); } uint64_t GetWritingRate() { // If no rate has been stored yet, a default value is used. 
if (rates_.size() == 0) return rate_writing_default_; // The writting rate is an average: this allows to cope with irregularities // in the throughput and prevent the rate limiter to fall into a flapping // effect, in which it would limit the throughput either way too much, // or not enough. This average allow the bandwitdth to converge smoothly. uint64_t sum = 0; for (size_t i = 0; i < rates_.size(); i++) { sum += rates_[i]; } uint64_t rate_limit_current = sum / rates_.size(); if (rate_limit_ > 0 && rate_limit_ < rate_limit_current) { return rate_limit_; } else if (rate_limit_current > 0){ return rate_limit_current; } else { return rate_writing_default_; } } uint64_t epoch_write_start_; uint64_t epoch_write_end_; uint64_t rate_limit_; uint64_t rate_incoming_; uint64_t rate_incoming_adjusted_; uint64_t rate_writing_; uint64_t rate_writing_default_; uint64_t epoch_last_; uint64_t epoch_current_; uint64_t duration_slept_; uint64_t bytes_per_microsecond_; std::mutex mutex_throttling_; std::vector rates_; }; } // namespace kdb #endif // KINGDB_RATE_LIMITER_H_ ================================================ FILE: cache/write_buffer.cc ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. #include "cache/write_buffer.h" namespace kdb { void WriteBuffer::Flush() { std::unique_lock lock_flush(mutex_flush_level2_); if (IsStopRequestedAndBufferEmpty()) return; // NOTE: Doing the flushing and waiting twice, in case the two buffers, // 'live' and 'copy', have items. This is a quick hack and a better // solution should be investigated. 
for (auto i = 0; i < 2; i++) { log::debug("LOCK", "2 lock"); cv_flush_.notify_one(); cv_flush_done_.wait_for(lock_flush, std::chrono::milliseconds(db_options_.internal__close_timeout)); } log::trace("WriteBuffer::Flush()", "end"); } Status WriteBuffer::Get(ReadOptions& read_options, ByteArray& key, ByteArray* value_out) { // NOTE: The lookups is done by iterating over vectors, which is fine // as long as the buffer doesn't become too big. This may need to be // changed for something faster at some point. // TODO: make sure the live buffer doesn't need to be protected by a mutex in // order to be accessed -- right now I'm relying on timing, but that may // be too weak to guarantee proper access // TODO: for items being stored that are not small enough, only parts will // be found in the buffers -- should the kv-store return "not found" // or should it try to send the data from the disk and the partially // available parts in the buffer? if (IsStopRequested()) return Status::IOError("Cannot handle request: WriteBuffer is closing"); // read the "live" buffer mutex_live_write_level1_.lock(); log::debug("LOCK", "1 lock"); mutex_indices_level3_.lock(); log::debug("LOCK", "3 lock"); auto& buffer_live = buffers_[im_live_]; int num_items = buffer_live.size(); mutex_indices_level3_.unlock(); log::debug("LOCK", "3 unlock"); mutex_live_write_level1_.unlock(); log::debug("LOCK", "1 unlock"); bool found = false; Order order_found; for (int i = 0; i < num_items; i++) { auto& order = buffer_live[i]; if (order.key == key) { found = true; order_found = order; } } if (found) { log::debug("WriteBuffer::Get()", "found in buffer_live"); if ( order_found.type == OrderType::Put && order_found.IsSelfContained()) { *value_out = order_found.chunk; (*value_out).set_size(order_found.size_value); (*value_out).set_size_compressed(order_found.size_value_compressed); return Status::OK(); } else if (order_found.type == OrderType::Delete) { return Status::DeleteOrder(); } else { return 
Status::NotFound("Unable to find entry"); } } // prepare to read the "copy" buffer log::debug("LOCK", "4 lock"); mutex_copy_write_level4_.lock(); log::debug("LOCK", "5 lock"); mutex_copy_read_level5_.lock(); num_readers_ += 1; mutex_copy_read_level5_.unlock(); log::debug("LOCK", "5 unlock"); mutex_copy_write_level4_.unlock(); log::debug("LOCK", "4 unlock"); // read from "copy" buffer found = false; log::debug("LOCK", "3 lock"); mutex_indices_level3_.lock(); auto& buffer_copy = buffers_[im_copy_]; mutex_indices_level3_.unlock(); log::debug("LOCK", "3 unlock"); for (auto& order: buffer_copy) { if (order.key == key) { found = true; order_found = order; } } Status s; if (found) log::debug("WriteBuffer::Get()", "found in buffer_copy"); if ( found && order_found.type == OrderType::Put && order_found.IsSelfContained()) { *value_out = order_found.chunk; (*value_out).set_size(order_found.size_value); (*value_out).set_size_compressed(order_found.size_value_compressed); } else if ( found && order_found.type == OrderType::Delete) { s = Status::DeleteOrder(); } else { s = Status::NotFound("Unable to find entry"); } // exit the "copy" buffer log::debug("LOCK", "5 lock"); mutex_copy_read_level5_.lock(); num_readers_ -= 1; mutex_copy_read_level5_.unlock(); log::debug("LOCK", "5 unlock"); cv_read_.notify_one(); return s; } Status WriteBuffer::Put(WriteOptions& write_options, ByteArray& key, ByteArray& chunk) { //return Write(OrderType::Put, key, value); return Status::InvalidArgument("WriteBuffer::Put() is not implemented"); } Status WriteBuffer::PutPart(WriteOptions& write_options, ByteArray& key, ByteArray& chunk, uint64_t offset_chunk, uint64_t size_value, uint64_t size_value_compressed, uint32_t crc32) { return WritePart(write_options, OrderType::Put, key, chunk, offset_chunk, size_value, size_value_compressed, crc32 ); } Status WriteBuffer::Delete(WriteOptions& write_options, ByteArray& key) { auto empty = ByteArray::NewEmptyByteArray(); return WritePart(write_options, 
// ---- Continuation of WriteBuffer::Delete(): remaining arguments of the
// ---- WritePart() call started on the previous line.
OrderType::Delete, key, empty, 0, 0, 0, 0); }

// Appends one write/delete order to the "live" buffer, and wakes up the
// flushing thread (ProcessingLoop) once the buffer has grown past
// buffer_size_.  Called by PutPart() and Delete().
Status WriteBuffer::WritePart(const WriteOptions& write_options,
                              const OrderType& op,
                              ByteArray& key,
                              ByteArray& chunk,
                              uint64_t offset_chunk,
                              uint64_t size_value,
                              uint64_t size_value_compressed,
                              uint32_t crc32) {
  if (IsStopRequested()) return Status::IOError("Cannot handle request: WriteBuffer is closing");
  log::trace("WriteBuffer::WritePart()", "key:[%s] | size chunk:%" PRIu64 ", total size value:%" PRIu64 " offset_chunk:%" PRIu64 " sizeOfBuffer:%d", key.ToString().c_str(), chunk.size(), size_value, offset_chunk, buffers_[im_live_].size());
  // The key only counts toward the incoming byte total on the first part.
  bool is_first_part = (offset_chunk == 0);
  bool is_large = key.size() + size_value > db_options_.storage__hstable_size;
  uint64_t bytes_arriving = 0;
  if (is_first_part) bytes_arriving += key.size();
  bytes_arriving += chunk.size();
  // Throttle incoming writes when rate limiting is enabled.
  if (UseRateLimiter()) rate_limiter_.Tick(bytes_arriving);
  // TODO: here the buffer index im_live_ is called outside of the level 2 and 3 mutexes is this really safe?
  log::debug("LOCK", "1 lock");
  // NOTE(review): the template argument appears to have been lost during
  // extraction on the std::unique_lock declarations in this file (likely
  // std::unique_lock<std::mutex>) -- verify against the upstream file.
  std::unique_lock lock_live(mutex_live_write_level1_);
  mutex_indices_level3_.lock();
  buffers_[im_live_].push_back(Order{std::this_thread::get_id(),
                                     write_options, op, key, chunk,
                                     offset_chunk, size_value,
                                     size_value_compressed, crc32,
                                     is_large});
  sizes_[im_live_] += bytes_arriving;
  uint64_t size_buffer_live = sizes_[im_live_];
  mutex_indices_level3_.unlock();
  /* if (buffers_[im_live_].size()) { for(auto &p: buffers_[im_live_]) { log::trace("WriteBuffer::WritePart()", "Write() ITEM key_ptr:[%p] key:[%s] | size chunk:%d, total size value:%d offset_chunk:%" PRIu64 " sizeOfBuffer:%d sizes_[im_live_]:%d", p.key, p.key->ToString().c_str(), p.chunk->size(), p.size_value, p.offset_chunk, buffers_[im_live_].size(), sizes_[im_live_]); } } else { log::trace("WriteBuffer::WritePart()", "Write() ITEM no buffers_[im_live_]"); } */
  if (size_buffer_live > buffer_size_) {
    // The live buffer is full: notify ProcessingLoop() so it swaps the live
    // and copy buffers and flushes.  The level-2 and level-3 locks are taken
    // before notifying so the loop is either idle or inside its wait when
    // the notification is sent.
    log::trace("WriteBuffer::WritePart()", "trying to swap");
    mutex_flush_level2_.lock();
    log::debug("LOCK", "2 lock");
    log::debug("LOCK", "3 lock");
    std::unique_lock lock_swap(mutex_indices_level3_);
    cv_flush_.notify_one();
    log::debug("LOCK", "3 unlock");
    mutex_flush_level2_.unlock();
    log::debug("LOCK", "2 unlock");
  } else {
    log::trace("WriteBuffer::WritePart()", "will not swap");
  }
  log::debug("LOCK", "1 unlock");
  return Status::OK();
}

// Background thread body (started from the constructor): waits until the
// live buffer holds data, swaps the live and copy buffers, hands the copy
// buffer to the storage engine through the event manager, waits for readers
// to drain, then clears the copy buffer.
void WriteBuffer::ProcessingLoop() {
  while(true) {
    bool force_sync = false;
    log::trace("WriteBuffer", "ProcessingLoop() - start");
    log::debug("LOCK", "2 lock");
    std::unique_lock lock_flush(mutex_flush_level2_);
    while (sizes_[im_live_] == 0) {
      log::trace("WriteBuffer", "ProcessingLoop() - wait - %" PRIu64 " %" PRIu64, buffers_[im_copy_].size(), buffers_[im_live_].size());
      // Wake up either on an explicit notification (buffer full or closing)
      // or after the configured flush timeout has elapsed.
      std::cv_status status = cv_flush_.wait_for(lock_flush, std::chrono::milliseconds(db_options_.write_buffer__flush_timeout));
      if (IsStopRequestedAndBufferEmpty()) return;
      if (status == std::cv_status::no_timeout) {
        // Explicitly notified: the flushed batch will be forced to sync.
        force_sync = true;
      }
    }
    mutex_indices_level3_.lock();
    // Swap only if the previous copy buffer has been fully cleared.
    if (sizes_[im_copy_] == 0) {
      std::swap(im_live_, im_copy_);
    }
    mutex_indices_level3_.unlock();
    log::trace("WriteBuffer", "ProcessingLoop() - start swap - %" PRIu64 " %" PRIu64, buffers_[im_copy_].size(), buffers_[im_live_].size());

    // Notify the storage engine that the buffer can be flushed
    log::trace("BM", "WAIT: Get()-flush_buffer");
    if (UseRateLimiter()) rate_limiter_.WriteStart();
    if (force_sync && buffers_[im_copy_].size()) {
      // The sync request is propagated through the first order of the batch.
      buffers_[im_copy_][0].write_options.sync = true;
    }
    event_manager_->flush_buffer.StartAndBlockUntilDone(buffers_[im_copy_]);

    // Wait for the index to notify the buffer manager
    log::trace("BM", "WAIT: Get()-clear_buffer");
    event_manager_->clear_buffer.Wait();
    event_manager_->clear_buffer.Done();

    // Wait for readers
    // TODO: the cleaning of the flush buffer shouldn't be done in one go but
    // in multiple iterations just like the transfer of indexes is being done,
    // so that the readers are never blocked for too long.
log::debug("LOCK", "4 lock"); mutex_copy_write_level4_.lock(); while(true) { log::debug("LOCK", "5 lock"); std::unique_lock lock_read(mutex_copy_read_level5_); if (num_readers_ == 0) break; log::debug("WriteBuffer", "ProcessingLoop() - wait for lock_read"); cv_read_.wait(lock_read); } log::debug("LOCK", "5 unlock"); if (UseRateLimiter()) rate_limiter_.WriteEnd(sizes_[im_copy_]); log::trace("WriteBuffer", "ProcessingLoop() bytes_in_buffer: %" PRIu64 " rate_writing: %" PRIu64, sizes_[im_copy_], rate_limiter_.GetWritingRate()); // Clear flush buffer log::debug("WriteBuffer::ProcessingLoop()", "clear flush buffer"); /* if (buffers_[im_copy_].size()) { for(auto &p: buffers_[im_copy_]) { log::trace("WriteBuffer", "ProcessingLoop() ITEM im_copy - key:[%s] | size chunk:%d, total size value:%d offset_chunk:%" PRIu64 " sizeOfBuffer:%d sizes_[im_copy_]:%d", p.key.ToString().c_str(), p.chunk.size(), p.size_value, p.offset_chunk, buffers_[im_copy_].size(), sizes_[im_copy_]); } } else { log::trace("WriteBuffer", "ProcessingLoop() ITEM no buffers_[im_copy_]"); } if (buffers_[im_live_].size()) { for(auto &p: buffers_[im_live_]) { log::trace("WriteBuffer", "ProcessingLoop() ITEM im_live - key:[%s] | size chunk:%d, total size value:%d offset_chunk:%" PRIu64 " sizeOfBuffer:%d sizes_[im_live_]:%d", p.key.ToString().c_str(), p.chunk.size(), p.size_value, p.offset_chunk, buffers_[im_live_].size(), sizes_[im_live_]); } } else { log::trace("WriteBuffer", "ProcessingLoop() ITEM no buffers_[im_live_]"); } */ // Note: the call to clear() can delete a lot of allocated memory at once, // which may block all other operations for a while: this may benefit // from throttling (using db_options_.internal__num_iterations_per_lock) sizes_[im_copy_] = 0; buffers_[im_copy_].clear(); log::trace("WriteBuffer", "ProcessingLoop() - end swap - %" PRIu64 " %" PRIu64, buffers_[im_copy_].size(), buffers_[im_live_].size()); mutex_copy_write_level4_.unlock(); log::debug("LOCK", "4 unlock"); log::debug("LOCK", "2 
unlock"); cv_flush_done_.notify_all(); if (IsStopRequestedAndBufferEmpty()) return; } } } // namespace kdb ================================================ FILE: cache/write_buffer.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. #ifndef KINGDB_WRITE_BUFFER_H_ #define KINGDB_WRITE_BUFFER_H_ #include "util/debug.h" #include #include #include #include #include #include #include #include #include "util/options.h" #include "util/byte_array.h" #include "util/order.h" #include "cache/rate_limiter.h" #include "thread/event_manager.h" namespace kdb { class WriteBuffer { public: WriteBuffer(const DatabaseOptions& db_options, EventManager *event_manager) : db_options_(db_options), event_manager_(event_manager), rate_limiter_(db_options.rate_limit_incoming) { stop_requested_ = false; im_live_ = 0; im_copy_ = 1; sizes_[im_live_] = 0; sizes_[im_copy_] = 0; num_readers_ = 0; buffer_size_ = db_options_.write_buffer__size / 2; thread_buffer_handler_ = std::thread(&WriteBuffer::ProcessingLoop, this); is_closed_ = false; log::debug("WriteBuffer::ctor()", "WriteBuffer::ctor() %" PRIu64 " - %s\n", db_options_.rate_limit_incoming, db_options_.write_buffer__mode_str.c_str()); if (db_options_.write_buffer__mode == kWriteBufferModeAdaptive) { log::debug("WriteBuffer::ctor()", "WriteBuffer::ctor() write buffer mode adaptive\n"); } else { log::debug("WriteBuffer::ctor()", "WriteBuffer::ctor() write buffer mode direct\n"); } } ~WriteBuffer() { Close(); } Status Get(ReadOptions& read_options, ByteArray& key, ByteArray* value_out); Status Put(WriteOptions& write_options, ByteArray& key, ByteArray& chunk); Status PutPart(WriteOptions& write_options, ByteArray& key, ByteArray& chunk, uint64_t offset_chunk, uint64_t size_value, uint64_t size_value_compressed, uint32_t crc32); Status Delete(WriteOptions& write_options, 
// ---- Continuation of the WriteBuffer class declaration: the parameter list
// ---- of Delete() starts on the previous line.
                ByteArray& key);
  void Flush();

  // Idempotent shutdown: stops accepting writes, flushes what remains, and
  // joins the background thread.  Guarded by mutex_close_ so the destructor
  // and explicit Close() calls do not race.
  void Close () {
    std::unique_lock lock(mutex_close_);
    if (is_closed_) return;
    is_closed_ = true;
    Stop();
    Flush();
    cv_flush_.notify_one();
    thread_buffer_handler_.join();
  }

  // Rate limiting is active in adaptive mode, or whenever an explicit
  // incoming rate limit has been configured.
  bool UseRateLimiter() {
    if (   db_options_.write_buffer__mode == kWriteBufferModeAdaptive
        || db_options_.rate_limit_incoming > 0) {
      return true;
    }
    return false;
  }

  bool IsStopRequestedAndBufferEmpty() {
    return (   IsStopRequested()
            && buffers_[im_live_].empty()
            && buffers_[im_copy_].empty());
  }

  bool IsStopRequested() { return stop_requested_; }
  void Stop() { stop_requested_ = true; }
  bool stop_requested_;

 private:
  Status WritePart(const WriteOptions& write_options, const OrderType& op, ByteArray& key, ByteArray& chunk, uint64_t offset_chunk, uint64_t size_value, uint64_t size_value_compressed, uint32_t crc32);
  void ProcessingLoop();

  DatabaseOptions db_options_;
  int im_live_;           // index of the buffer currently receiving writes
  int im_copy_;           // index of the buffer currently being flushed
  uint64_t buffer_size_;  // byte threshold that triggers a flush
  int num_readers_;       // readers currently scanning the copy buffer
  // NOTE(review): template arguments were lost during extraction on the two
  // members below (likely std::array<std::vector<Order>, 2> and
  // std::array<uint64_t, 2>); restore from the upstream file.
  std::array, 2> buffers_;
  std::array sizes_;
  bool is_closed_;
  std::mutex mutex_close_;
  std::thread thread_buffer_handler_;
  EventManager *event_manager_;
  RateLimiter rate_limiter_;

  // Using a lock hierarchy to avoid deadlocks
  std::mutex mutex_live_write_level1_;
  std::mutex mutex_flush_level2_;
  std::mutex mutex_indices_level3_;
  std::mutex mutex_copy_write_level4_;
  std::mutex mutex_copy_read_level5_;
  std::condition_variable cv_flush_;
  std::condition_variable cv_flush_done_;
  std::condition_variable cv_read_;
};

} // namespace kdb

#endif // KINGDB_WRITE_BUFFER_H_


================================================ FILE: doc/bench/benchmarks.md ================================================

Benchmarks of KingDB v0.9.0 and LevelDB v1.18
=============================================

Date: April 9, 2015

##TL;DR

Benchmarks were run for different value sizes, from 100 bytes to 1MB, using the benchmark framework of LevelDB. The benchmark program for KingDB can be found in `doc/bench/db_bench_kingdb.cc`

- Random writes: KingDB is faster than LevelDB by 2-10x.
- Sequential writes: KingDB and LevelDB are equivalent.
- Overwrites: KingDB is faster than LevelDB by 2-14x.
- Random reads: Before compaction, KingDB is faster than LevelDB by 1.5-3x. However after compaction, LevelDB is faster than KingDB by around 1.7x.
- Sequential reads: LevelDB is faster than KingDB by 3-50x. Yes, KingDB is really bad for sequential reads.
- Compaction: LevelDB has faster compactions than KingDB, by 3-4x.

## Description

The benchmarks were run on two systems, a Linux CentOS 6.4 and a Mac OS X 10.9.5.

### Linux

- Operating System: CentOS 6.4
- RAM: 128GB DDR3
- CPU: 12-core Intel Xeon E5-2640 @ 2.50GHz
- Compiler: GCC 4.9.2
- Compiler and linker options: \-O2 \-fno-builtin-memcmp \-ltcmalloc
- Storage: hard drive
- File system: ext3

### Mac

- Operating System: OS X 10.9.5
- RAM: 8GB DDR3
- CPU: 4-core Intel Core i7 @ 2.3GHz
- Compiler: Apple LLVM version 6.0
- Compiler and linker options: \-O2
- Storage: solid state drive
- File system: HFS+

### Key-value store options

- LevelDB v1.18 has a write cache of 4MB, and uses Snappy v1.1.2 for compression.
- KingDB v0.9.0 has a write cache of 4MB, uses LZ4 v1.3.0 for compression.

## Results

For each workload, multiple value sizes are tested. The performance of LevelDB and KingDB is compared using [fold change](http://en.wikipedia.org/wiki/Fold_change) metrics. The first fold change column refers to the Linux system, and the second fold change column refers to the Mac system. A positive fold change means that KingDB is faster than LevelDB, and a negative fold change means that LevelDB is faster than KingDB. Except for the value size and the fold changes, the metrics for all the values in the table are in number of operations per second (ops/second).
| Workload | Value size | LevelDB Linux | KingDB Linux | fold change | LevelDB Mac | KingDB Mac | fold change | | --------------: | ---------: | -------------: | -------------: | ----------: | -------------: | -------------: | ----------: | | fillseq | 100b | 265957 | 233808 | -1.14x | 224064 | 255885 | 1.14x | | fillseq | 1kb | 95721 | 99127 | 1.04x | 117827 | 151057 | 1.28x | | fillseq | 100kb | 1605 | 1827 | 1.14x | 1646 | 2953 | 1.79x | | fillseq | 256kb | 674 | 670 | -1.01x | 690 | 1303 | 1.89x | | fillseq | 512kb | 336 | 368 | 1.10x | 317 | 649 | 2.05x | | fillseq | 1mb | 169 | 180 | 1.07x | 140 | 313 | 2.24x | | fillrandom | 100b | 127551 | 251635 | 1.97x | 199560 | 254647 | 1.28x | | fillrandom | 1kb | 13063 | 104493 | 8.00x | 19738 | 148214 | 7.51x | | fillrandom | 100kb | 299 | 1821 | 6.09x | 303 | 3265 | 10.78x | | fillrandom | 256kb | 167 | 743 | 4.45x | 269 | 1274 | 4.74x | | fillrandom | 512kb | 97 | 375 | 3.87x | 85 | 654 | 7.69x | | fillrandom | 1mb | 38 | 184 | 4.84x | 34 | 331 | 9.74x | | overwrite | 100b | 127194 | 250689 | 1.97x | 176056 | 254971 | 1.45x | | overwrite | 1kb | 13582 | 108096 | 7.96x | 15211 | 142877 | 9.39x | | overwrite | 100kb | 229 | 1806 | 7.89x | 242 | 3462 | 14.31x | | overwrite | 256kb | 117 | 738 | 6.31x | 149 | 1308 | 8.78x | | overwrite | 512kb | 71 | 374 | 5.27x | 64 | 652 | 10.19x | | overwrite | 1mb | 23 | 186 | 8.09x | 25 | 334 | 13.36x | | readrandom | 100b | 147907 | 203417 | 1.38x | 130667 | 178890 | 1.37x | | readrandom | 1kb | 79057 | 130582 | 1.65x | 72228 | 91810 | 1.27x | | readrandom | 100kb | 4211 | 9480 | 2.25x | 4700 | 11843 | 2.52x | | readrandom | 256kb | 2812 | 4054 | 1.44x | 4480 | 5399 | 1.21x | | readrandom | 512kb | 737 | 2122 | 2.88x | 925 | 2567 | 2.78x | | readrandom | 1mb | 636 | 954 | 1.50x | 628 | 1237 | 1.97x | | readseq | 100b | 3289473 | 104482 | -31.48x | 3649635 | 76952 | -47.43x | | readseq | 1kb | 660501 | 83187 | -7.94x | 930232 | 55509 | -16.76x | | readseq | 100kb | 12761 | 5485 | 
-2.33x | 20774 | 8499 | -2.44x | | readseq | 256kb | 6208 | 2116 | -2.93x | 7550 | 3767 | -2.00x | | readseq | 512kb | 4145 | 1308 | -3.17x | 4862 | 1939 | -2.51x | | readseq | 1mb | 2047 | 531 | -3.85x | 1750 | 930 | -1.88x | The benchmarks for the random and sequential reads were re-run after a compaction process: | Workload | Value size | LevelDB Linux | KingDB Linux | fold change | LevelDB Mac | KingDB Mac | fold change | | --------------: | ---------: | -------------: | -------------: | ----------: | -------------: | -------------: | ----------: | | readrandom | 100b | 202306 | 219925 | 1.09x | 195083 | 183049 | -1.07x | | readrandom | 1kb | 192752 | 152858 | -1.26x | 173190 | 148875 | -1.16x | | readrandom | 100kb | 13863 | 8679 | -1.60x | 19809 | 12987 | -1.53x | | readrandom | 256kb | 4285 | 2422 | -1.77x | 8890 | 5462 | -1.63x | | readrandom | 512kb | 2784 | 1704 | -1.63x | 4623 | 2788 | -1.66x | | readrandom | 1mb | 1461 | 843 | -1.73x | 2402 | 1390 | -1.73x | | readseq | 100b | 3717472 | 489715 | -7.59x | 4366812 | 239923 | -18.20x | | readseq | 1kb | 1242236 | 320102 | -3.88x | 1287001 | 188714 | -6.82x | | readseq | 100kb | 26271 | 7537 | -3.49x | 32398 | 9355 | -3.46x | | readseq | 256kb | 11467 | 2050 | -5.59x | 13220 | 3799 | -3.48x | | readseq | 512kb | 2871 | 1191 | -2.41x | 7270 | 2006 | -3.62x | | readseq | 1mb | 3021 | 683 | -4.42x | 3274 | 973 | -3.36x | ================================================ FILE: doc/bench/db_bench_kingdb.cc ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. // The code below was copied from LevelDB and adapted for use with KingDB. // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
// To compile this benchmark, copy this file into the LevelDB repository // at the location: doc/bench/db_bench_kingdb.cc // // Then update the LevelDB Makefile: // - Add "-I/usr/local/include/kingdb" to CFLAGS and CXXFLAGS. // - Add the following rule: // db_bench_kingdb: doc/bench/db_bench_kingdb.o $(LIBOBJECTS) $(TESTUTIL) // $(CXX) $(LDFLAGS) doc/bench/db_bench_kingdb.o $(LIBOBJECTS) $(TESTUTIL) -o $@ -L/usr/local/lib -lpthread -lkingdb $(LIBS) // // From the LevelDB root directory, just run: 'make db_bench_kingdb'. #include #include #include "util/histogram.h" #include "util/random.h" #include "util/testutil.h" #include // Comma-separated list of operations to run in the specified order // Actual benchmarks: // // fillseq -- write N values in sequential key order in async mode // fillrandom -- write N values in random key order in async mode // overwrite -- overwrite N values in random key order in async mode // fillseqsync -- write N/100 values in sequential key order in sync mode // fillrandsync -- write N/100 values in random key order in sync mode // fillrand100K -- write N/1000 100K values in random order in async mode // fillseq100K -- write N/1000 100K values in seq order in async mode // readseq -- read N times sequentially // readseq100K -- read N/1000 100K values in sequential order in async mode // readrand100K -- read N/1000 100K values in sequential order in async mode // readrandom -- read N times in random order static const char* FLAGS_benchmarks = "fillseq," "fillseqsync," "fillrandsync," "fillrandom," "overwrite," "readrandom," "readseq," "compact," "readrandom," "readseq," //"fillrand100K," //"fillseq100K," //"readseq100K," //"readrand100K," ; // Number of key/values to place in database static int FLAGS_num = 1000000; // Number of read operations to do. If negative, do FLAGS_num reads. 
static int FLAGS_reads = -1; // Size of each value static int FLAGS_value_size = 100; // Arrange to generate values that shrink to this fraction of // their original size after compression static double FLAGS_compression_ratio = 0.5; // Print histogram of operation timings static bool FLAGS_histogram = false; // Cache size. Default 4 MB static int FLAGS_cache_size = 4194304; // Page size. Default 1 KB static int FLAGS_page_size = 1024; // ignored in KingDB // If true, do not destroy the existing database. If you set this // flag and also specify a benchmark that wants a fresh database, that // benchmark will fail. static bool FLAGS_use_existing_db = false; // Compression flag. If true, compression is on. If false, compression // is off. static bool FLAGS_compression = true; // Use the db with the following name. static const char* FLAGS_db = NULL; inline static void DBSynchronize(kdb::KingDB* db_) { // Flush will flush writes to disk db_->Flush(); } inline static void Compact(kdb::KingDB* db_) { db_->Compact(); } namespace leveldb { // Helper for quickly generating random data. namespace { class RandomGenerator { private: std::string data_; int pos_; public: RandomGenerator() { // We use a limited amount of data over and over again and ensure // that it is larger than the compression window (32KB), and also // large enough to serve all typical value sizes we want to write. Random rnd(301); std::string piece; while (data_.size() < 1048576) { // Add a short fragment that is as compressible as specified // by FLAGS_compression_ratio. 
test::CompressibleString(&rnd, FLAGS_compression_ratio, 100, &piece); data_.append(piece); } pos_ = 0; } kdb::ByteArray Generate(int len) { if (pos_ + len > data_.size()) { pos_ = 0; assert(len < data_.size()); } pos_ += len; return kdb::ByteArray::NewPointerByteArray(data_.data() + pos_ - len, len); } }; static Slice TrimSpace(Slice s) { int start = 0; while (start < s.size() && isspace(s[start])) { start++; } int limit = s.size(); while (limit > start && isspace(s[limit-1])) { limit--; } return Slice(s.data() + start, limit - start); } } // namespace class Benchmark { private: kdb::Database* db_; int db_num_; int num_; int reads_; double start_; double last_op_finish_; int64_t bytes_; std::string message_; Histogram hist_; RandomGenerator gen_; Random rand_; // State kept for progress messages int done_; int next_report_; // When to report next void PrintHeader() { const int kKeySize = 16; PrintEnvironment(); fprintf(stdout, "Keys: %d bytes each\n", kKeySize); fprintf(stdout, "Values: %d bytes each (%d bytes after compression)\n", FLAGS_value_size, static_cast(FLAGS_value_size * FLAGS_compression_ratio + 0.5)); fprintf(stdout, "Entries: %d\n", num_); fprintf(stdout, "RawSize: %.1f MB (estimated)\n", ((static_cast(kKeySize + FLAGS_value_size) * num_) / 1048576.0)); fprintf(stdout, "FileSize: %.1f MB (estimated)\n", (((kKeySize + FLAGS_value_size * FLAGS_compression_ratio) * num_) / 1048576.0)); PrintWarnings(); fprintf(stdout, "------------------------------------------------\n"); } void PrintWarnings() { #if defined(__GNUC__) && !defined(__OPTIMIZE__) fprintf(stdout, "WARNING: Optimization is disabled: benchmarks unnecessarily slow\n" ); #endif #ifndef NDEBUG fprintf(stdout, "WARNING: Assertions are enabled; benchmarks unnecessarily slow\n"); #endif } void PrintEnvironment() { fprintf(stderr, "KingDB version: %d.%d.%d - Data format version: %d.%d\n", kdb::kVersionMajor, kdb::kVersionMinor, kdb::kVersionRevision, kdb::kVersionDataFormatMajor, 
kdb::kVersionDataFormatMinor); #if defined(__linux) time_t now = time(NULL); fprintf(stderr, "Date: %s", ctime(&now)); // ctime() adds newline FILE* cpuinfo = fopen("/proc/cpuinfo", "r"); if (cpuinfo != NULL) { char line[1000]; int num_cpus = 0; std::string cpu_type; std::string cache_size; while (fgets(line, sizeof(line), cpuinfo) != NULL) { const char* sep = strchr(line, ':'); if (sep == NULL) { continue; } Slice key = TrimSpace(Slice(line, sep - 1 - line)); Slice val = TrimSpace(Slice(sep + 1)); if (key == "model name") { ++num_cpus; cpu_type = val.ToString(); } else if (key == "cache size") { cache_size = val.ToString(); } } fclose(cpuinfo); fprintf(stderr, "CPU: %d * %s\n", num_cpus, cpu_type.c_str()); fprintf(stderr, "CPUCache: %s\n", cache_size.c_str()); } #endif } void Start() { start_ = Env::Default()->NowMicros() * 1e-6; bytes_ = 0; message_.clear(); last_op_finish_ = start_; hist_.Clear(); done_ = 0; next_report_ = 100; } void FinishedSingleOp() { if (FLAGS_histogram) { double now = Env::Default()->NowMicros() * 1e-6; double micros = (now - last_op_finish_) * 1e6; hist_.Add(micros); if (micros > 20000) { fprintf(stderr, "long op: %.1f micros%30s\r", micros, ""); fflush(stderr); } last_op_finish_ = now; } done_++; if (done_ >= next_report_) { if (next_report_ < 1000) next_report_ += 100; else if (next_report_ < 5000) next_report_ += 500; else if (next_report_ < 10000) next_report_ += 1000; else if (next_report_ < 50000) next_report_ += 5000; else if (next_report_ < 100000) next_report_ += 10000; else if (next_report_ < 500000) next_report_ += 50000; else next_report_ += 100000; fprintf(stderr, "... finished %d ops%30s\r", done_, ""); fflush(stderr); } } void Stop(const Slice& name) { double finish = Env::Default()->NowMicros() * 1e-6; // Pretend at least one op was done in case we are running a benchmark // that does not call FinishedSingleOp(). 
if (done_ < 1) done_ = 1; if (bytes_ > 0) { char rate[100]; snprintf(rate, sizeof(rate), "%6.1f MB/s", (bytes_ / 1048576.0) / (finish - start_)); if (!message_.empty()) { message_ = std::string(rate) + " " + message_; } else { message_ = rate; } } fprintf(stdout, "%-12s : %11.3f micros/op;%s%s\n", name.ToString().c_str(), (finish - start_) * 1e6 / done_, (message_.empty() ? "" : " "), message_.c_str()); if (FLAGS_histogram) { fprintf(stdout, "Microseconds per op:\n%s\n", hist_.ToString().c_str()); } fflush(stdout); } public: enum Order { SEQUENTIAL, RANDOM }; enum DBState { FRESH, EXISTING }; Benchmark() : db_(NULL), num_(FLAGS_num), reads_(FLAGS_reads < 0 ? FLAGS_num : FLAGS_reads), bytes_(0), rand_(301) { std::vector files; std::string test_dir; Env::Default()->GetTestDirectory(&test_dir); Env::Default()->GetChildren(test_dir.c_str(), &files); if (!FLAGS_use_existing_db) { for (int i = 0; i < files.size(); i++) { if (Slice(files[i]).starts_with("dbbench_polyDB")) { std::string file_name(test_dir); file_name += "/"; file_name += files[i]; Env::Default()->DeleteFile(file_name.c_str()); } } } } ~Benchmark() { db_->Close(); } void Run() { PrintHeader(); Open(false); const char* benchmarks = FLAGS_benchmarks; while (benchmarks != NULL) { const char* sep = strchr(benchmarks, ','); Slice name; if (sep == NULL) { name = benchmarks; benchmarks = NULL; } else { name = Slice(benchmarks, sep - benchmarks); benchmarks = sep + 1; } Start(); bool known = true; bool write_sync = false; if (name == Slice("fillseq")) { Write(write_sync, SEQUENTIAL, FRESH, num_, FLAGS_value_size, 1); } else if (name == Slice("fillrandom")) { Write(write_sync, RANDOM, FRESH, num_, FLAGS_value_size, 1); } else if (name == Slice("overwrite")) { Write(write_sync, RANDOM, EXISTING, num_, FLAGS_value_size, 1); } else if (name == Slice("compact")) { Compact(db_); } else if (name == Slice("fillrandsync")) { write_sync = true; Write(write_sync, RANDOM, FRESH, num_ / 100, FLAGS_value_size, 1); 
DBSynchronize(db_); } else if (name == Slice("fillseqsync")) { write_sync = true; Write(write_sync, SEQUENTIAL, FRESH, num_ / 100, FLAGS_value_size, 1); DBSynchronize(db_); } else if (name == Slice("fillrand100K")) { Write(write_sync, RANDOM, FRESH, num_ / 1000, 100 * 1000, 1); } else if (name == Slice("fillseq100K")) { Write(write_sync, SEQUENTIAL, FRESH, num_ / 1000, 100 * 1000, 1); } else if (name == Slice("readseq")) { ReadSequential(); } else if (name == Slice("readrandom")) { ReadRandom(); } else if (name == Slice("readrand100K")) { int n = reads_; reads_ /= 1000; ReadRandom(); reads_ = n; } else if (name == Slice("readseq100K")) { int n = reads_; reads_ /= 1000; ReadSequential(); reads_ = n; } else { known = false; if (name != Slice()) { // No error message for empty name fprintf(stderr, "unknown benchmark '%s'\n", name.ToString().c_str()); } } if (known) { Stop(name); } } } private: void EraseDB(char *dbname) { struct dirent *entry; DIR *dir; char filepath[512]; struct stat info; if (stat(dbname, &info) != 0) return; dir = opendir(dbname); while ((entry = readdir(dir)) != nullptr) { sprintf(filepath, "%s/%s", dbname, entry->d_name); std::remove(filepath); } rmdir(dbname); } void Open(bool sync) { assert(db_ == NULL); // Initialize db_ kdb::DatabaseOptions db_options; if (FLAGS_compression) { db_options.compression = kdb::kLZ4Compression; } else { db_options.compression = kdb::kNoCompression; } db_options.storage__hstable_size = 1024 * 1024 * 128; db_options.write_buffer__size = FLAGS_cache_size; char file_name[100]; db_num_++; std::string test_dir; Env::Default()->GetTestDirectory(&test_dir); snprintf(file_name, sizeof(file_name), "%s/kdb-benchmark-%d", test_dir.c_str(), db_num_); char* dirpath = const_cast(test_dir.c_str()); EraseDB(dirpath); db_ = new kdb::Database(db_options, dirpath); db_->Open(); kdb::Logger::set_current_level("emerg"); write_options_ = kdb::WriteOptions(); write_options_.sync = sync; } void Write(bool sync, Order order, DBState state, 
int num_entries, int value_size, int entries_per_batch) { // Create new database if state == FRESH if (state == FRESH) { if (FLAGS_use_existing_db) { message_ = "skipping (--use_existing_db is true)"; return; } delete db_; db_ = NULL; Open(sync); Start(); // Do not count time taken to destroy/open } if (num_entries != num_) { char msg[100]; snprintf(msg, sizeof(msg), "(%d ops)", num_entries); message_ = msg; } // Write to database for (int i = 0; i < num_entries; i++) { const int k = (order == SEQUENTIAL) ? i : (rand_.Next() % num_entries); char key[100]; snprintf(key, sizeof(key), "%016d", k); bytes_ += value_size + strlen(key); kdb::ByteArray ba_key = kdb::ByteArray::NewDeepCopyByteArray(key, strlen(key)); kdb::ByteArray ba_value = gen_.Generate(value_size); db_->Put(write_options_, ba_key, ba_value); FinishedSingleOp(); } } void ReadSequential() { kdb::ReadOptions read_options; kdb::Iterator iterator = db_->NewIterator(read_options); iterator.Begin(); Start(); // Do not count time taken to destroy/open for (; iterator.IsValid(); iterator.Next()) { kdb::ByteArray key = iterator.GetKey(); kdb::ByteArray value = iterator.GetValue(); bytes_ += key.size() + value.size(); FinishedSingleOp(); } } void ReadRandom() { std::string value; kdb::ReadOptions read_options; for (int i = 0; i < reads_; i++) { char key[100]; const int k = rand_.Next() % reads_; snprintf(key, sizeof(key), "%016d", k); std::string key_cpp(key); kdb::ByteArray ba_value; db_->Get(read_options, key_cpp, &ba_value); bytes_ += key_cpp.size() + ba_value.size(); FinishedSingleOp(); } } kdb::WriteOptions write_options_; }; } // namespace leveldb int main(int argc, char** argv) { kdb::Logger::set_current_level("emerg"); std::string default_db_path; for (int i = 1; i < argc; i++) { double d; int n; char junk; if (leveldb::Slice(argv[i]).starts_with("--benchmarks=")) { FLAGS_benchmarks = argv[i] + strlen("--benchmarks="); } else if (sscanf(argv[i], "--compression_ratio=%lf%c", &d, &junk) == 1) { 
FLAGS_compression_ratio = d; } else if (sscanf(argv[i], "--histogram=%d%c", &n, &junk) == 1 && (n == 0 || n == 1)) { FLAGS_histogram = n; } else if (sscanf(argv[i], "--num=%d%c", &n, &junk) == 1) { FLAGS_num = n; } else if (sscanf(argv[i], "--reads=%d%c", &n, &junk) == 1) { FLAGS_reads = n; } else if (sscanf(argv[i], "--value_size=%d%c", &n, &junk) == 1) { FLAGS_value_size = n; } else if (sscanf(argv[i], "--cache_size=%d%c", &n, &junk) == 1) { FLAGS_cache_size = n; } else if (sscanf(argv[i], "--page_size=%d%c", &n, &junk) == 1) { FLAGS_page_size = n; } else if (sscanf(argv[i], "--compression=%d%c", &n, &junk) == 1 && (n == 0 || n == 1)) { FLAGS_compression = (n == 1) ? true : false; } else if (strncmp(argv[i], "--db=", 5) == 0) { FLAGS_db = argv[i] + 5; } else { fprintf(stderr, "Invalid flag '%s'\n", argv[i]); exit(1); } } // Choose a location for the test database if none given with --db= if (FLAGS_db == NULL) { leveldb::Env::Default()->GetTestDirectory(&default_db_path); default_db_path += "/dbbench"; FLAGS_db = default_db_path.c_str(); } leveldb::Benchmark benchmark; benchmark.Run(); return 0; } ================================================ FILE: doc/bench/generate_benchmarks_table.py ================================================ #!/usr/bin/python # This program reads outputs from the LevelDB benchmarks and generates # a table in Markdown format to compare the performance of LevelDB # and KingDB. 
# In the case of the comparison between LevelDB and KingDB,
# it makes more sense to use a fold change rather than
# a percentage change:
#
# http://en.wikipedia.org/wiki/Fold_change
# http://en.wikipedia.org/wiki/Relative_change_and_difference

import sys


def fold_change(ldb, kdb):
    """Symmetric fold change: positive when kdb is faster, negative when
    ldb is faster; the magnitude is always >= 1."""
    growth = 0
    ldb = float(ldb)
    kdb = float(kdb)
    if (ldb > kdb):
        growth = - ldb / kdb
    else:
        growth = kdb / ldb
    return growth


def perc_change(ldb, kdb):
    """Relative change of kdb with respect to ldb, in percent."""
    ldb = float(ldb)
    kdb = float(kdb)
    return (kdb - ldb) / ldb * 100.0


# argv[1] is the benchmark output captured on the Mac system,
# argv[2] the output captured on the Linux system.
f = open(sys.argv[1], "r")
lines_mac = f.readlines()
f.close()

f = open(sys.argv[2], "r")
lines_linux = f.readlines()
f.close()

lines = lines_mac + lines_linux

# perf[workload][size][database][cpu] -> operations per second
perf = {}
database = None
workload = None
cpu = None
size = None
has_compaction_passed = False
for line in lines:
    line = line.strip()
    items = line.split()
    # Header lines name the database whose results follow.
    if line.startswith('LevelDB'):
        database = 'leveldb'
        has_compaction_passed = False
    elif line.startswith('KingDB'):
        database = 'kingdb'
        has_compaction_passed = False
    # The CPU description line identifies which system produced the output.
    if line.startswith('CPU:'):
        cpu = line.split(' ', 1)[1].strip()
        if cpu.startswith('24 *'):
            cpu = 'linux'
        elif cpu.startswith('mac_nodatacopy'):
            cpu = 'mac_nodatacopy'
        elif cpu.startswith('mac_ndc_snappy'):
            cpu = 'mac_ndc_snappy'
        elif cpu.startswith('mac'):
            cpu = 'mac'
    if line.startswith('Values'):
        size = items[1]
    # Result lines look like: "<workload> : <micros> micros/op; ..."
    if len(items) > 3 and items[3] == 'micros/op;':
        workload = items[0]
        ops = 1.0 / (float(items[2])/1000000.0)
        ops = int(ops)
        if workload == 'fillrandsync':
            workload = 'fillsync'
        # Workloads that ran after a 'compact' step get an '-ac' suffix
        # ("after compaction").
        if has_compaction_passed:
            workload += '-ac'
        if workload == "compact":
            has_compaction_passed = True
        if workload not in perf:
            perf[workload] = {}
        if size not in perf[workload]:
            perf[workload][size] = {}
        if database not in perf[workload][size]:
            perf[workload][size][database] = {}
        if cpu not in perf[workload][size][database]:
            perf[workload][size][database][cpu] = {}
        perf[workload][size][database][cpu] = ops

sizes = {100: '100b',
         1024: '1kb',
         1024*100: '100kb',
         1024*256: '256kb',
         1024*512: '512kb',
         1024*1024: '1mb' }

# Print the Markdown table header and the dashed separator row.
line = "| %15s | %10s | %14s | %14s | %11s | %14s | %14s | %11s |" % ('Workload', 'Value size', 'LevelDB Linux', 'KingDB Linux', 'fold change', 'LevelDB Mac', 'KingDB Mac', 'fold change')
print line
line_separator = "| %015d | %010d | %014d | %014d | %011d | %014d | %014d | %011d |" % (0, 0, 0, 0, 0, 0, 0, 0)
line_separator = line_separator.replace('0', '-')
print line_separator

for workload in ['fillseq', 'fillrandom', 'overwrite', 'readrandom', 'readseq', 'readrandom-ac', 'readseq-ac']:
    if any( substring in workload for substring in ['100K', 'snappy', 'crc32','acquire', 'reverse', 'fillseqsync', 'compact'] ): continue
    for size in [100, 1024, 1024*100, 1024*256, 1024*512, 1024*1024]:
        size_human = sizes[size]
        size = str(size)
        # Missing data points default to -1 so the table can still be printed.
        ldb_linux = -1
        ldb_mac = -1
        kdb_linux = -1
        kdb_mac = -1
        try: ldb_linux = perf[workload][size]['leveldb']['linux']
        except: pass
        try: ldb_mac = perf[workload][size]['leveldb']['mac']
        except: pass
        try: kdb_linux = perf[workload][size]['kingdb']['linux']
        except: pass
        try: kdb_mac = perf[workload][size]['kingdb']['mac']
        except: pass
        perc_linux = fold_change(ldb_linux, kdb_linux)
        perc_mac = fold_change(ldb_mac, kdb_mac)
        line = "| %15s | %10s | %14s | %14s | % 10.2fx | %14s | %14s | % 10.2fx |" % (workload, size_human, ldb_linux, kdb_linux, perc_linux, ldb_mac, kdb_mac, perc_mac)
        print line


================================================ FILE: doc/kingdb.md ================================================

Documentation of KingDB v0.9.0
==============================

##Table of Contents

**[1. Why use KingDB?](#1-why-use-kingdb)**
**[2. How to install KingDB?](#2-how-to-install-kingdb)**
**[3. How to compile programs with KingDB?](#3-how-to-compile-programs-with-kingdb)**
**[4. Basic API usage](#4-basic-api-usage)**
**[5. The ByteArray class](#5-the-bytearray-class)**
**[6. Multipart API](#6-multipart-api)**
**[7. Logging with Syslog](#7-logging-with-syslog)**
**[8. Options](#8-options)**

##1. Why use KingDB?
###KingDB is simple The architecture, code, and data format of KingDB are simple. You do not need to be a system programming expert or a storage engineer to understand how it works and tune it to your needs. ###Fast for writes and random reads Under the hood, KingDB uses log-structured storage, which makes writes very fast. An in-memory hash table indexes entries making random reads very fast. ###Multipart API KingDB has a multipart API for both the reads and writes: you can access your data in small parts, without having to store all the data at once in memory, making it easy to work with large entries without killing the caches of your CPU or make your program timeout. ### Snapshots and Iterators KingDB has read-only snapshots, that offer consistent views of a database. KingDB also has iterators so you can iterate over all your entries. Under the hood, KingDB is just a hash table and memory mapped files, which means you cannot access your data ordered by key. If you need a database that can give fast sequential reads ordered by keys, you should checkout another key-value store called LevelDB. ### Background compaction Regularly, KingDB runs a compaction process in background to recover unused disk space and make sure the database is stored in the most compact way possible. Please note, compaction is not compression. ###Covered with unit tests The latest version of KingDB is 0.9.0, which is a beta version. All changes on KingDB are tested using unit tests. ###KingDB is single-machine only KingDB is **not** a distributed and replicated multi-node datastore such as Cassandra or Riak. KingDB was designed to be a storage engine that lives on a single machine. If the disk on that machine dies, you lose your data. Luckily, the data format is loose enough that it is easy to do incremental backups and keep a copy of your data on a secondary machine if you need to. ###The KingServer network interface You can access your data through a network interface using KingServer. 
For more information about KingServer, check out the [KingServer documentation](kingserver.md). ##2. How to install KingDB? KingDB has no external dependencies and has been tested on: - Mac OS X 10.9.5 with Apple LLVM version 6.0 (clang-600.0.51) - Linux Ubuntu 14.04 x64 with GCC 4.9.2 - Linux Ubuntu 15.04 x64 with GCC 4.9.2-10ubuntu13 - Linux CentOS 6.5 x86\_64 with GCC 4.9.2 Because KingDB uses C++11, you need GCC >=4.9.2 or Clang >=3.3. The following commands will compile KingDB as a static library, and will install it on your computer. This will also install the `kingserver` program. $ tar zxvf kingdb.tar.gz $ cd kingdb $ make $ sudo make install If you are using GCC, update the Makefile and add \-fno\-builtin\-memcmp in the CFLAGS, and if you have tcmalloc on your system, add \-ltcmalloc to the LDFLAGS. This will give you a nice performance speed\-up. ##3. How to compile programs with KingDB? Once you have compiled KingDB and installed the static library by following the instructions above, you can compile your own programs by linking them with the KingDB library. With LLVM: $ g++ -std=c++11 -I/usr/local/include/kingdb -lkingdb kingdb_user.cc -o kingdb_user With GCC: $ g++ -std=c++11 -I/usr/local/include/kingdb kingdb_user.cc -o kingdb_user -Wl,--no-as-needed -L/usr/local/lib -lpthread -lkingdb For an example of what a user program would look like, you can refer to the `kingdb_user.cc` file in the unit-tests directory. ##4. Basic API usage ###No pointers, everything is an object You don't need to worry about memory management. KingDB will never return a pointer, and will never force you to maintain a pointer. All the API calls in KingDB return objects, and therefore the memory, file descriptors and other resources used by those objects will all be released when the objects go out of scope. ###The Status class The `Status` class is how KingDB deals with errors. All the methods in the KingDB API return a `Status` object. KingDB does not throw exceptions. 
Exceptions may still be thrown due to erroneous system calls, but KingDB does not catch them, because if they are thrown, it means that KingDB should die anyway. The `Status` class allows you to test if an error occurred, and if so, to print the relevant error message: kdb::Status s = ...; if (!s.IsOK()) cerr << s.ToString() << endl; ###Opening a database #include // Create a new database, which will be stored at the path "/tmp/mydb" kdb::DatabaseOptions db_options; kdb::Database db(db_options, "/tmp/mydb"); kdb::Status s = db.Open(); if (!s.IsOK()) cerr << s.ToString() << endl; db.Close(); // optional ###Reading, writing, and deleting an entry kdb::Status s; kdb::WriteOptions write_options; s = db.Put(write_options, “key1", "value1"); if (!s.IsOK()) cerr << s.ToString() << endl; kdb::ReadOptions read_options; std::string value_out; s = db.Get(read_options, “key1", &value_out); if (!s.IsOK()) cerr << s.ToString() << endl; s = db.Delete("key1"); if (!s.IsOK()) cerr << s.ToString() << endl; **IMPORTANT:** If you need to store and retrieve entries larger than 1MB, read carefully the section about the [multipart API](#6-multipart-api). ###Syncing writes You can sync writes to the secondary storage by setting the `sync` parameter in `WriteOptions`, which is false by default: kdb::WriteOptions write_options; write_options.sync = true; kdb::Status s = db.Put(write_options, “key1", "value1"); if (!s.IsOK()) cerr << s.ToString() << endl; ###Verifying checksums A unique checksum is stored with each entry when it is persisted to secondary storage. 
By default, these checksums are not verified, but you can choose to verify these checksums when reading entries, by setting the `verify_checksums` parameter in `ReadOptions`, which is false by default: kdb::ReadOptions read_options; read_options.verify_checksums = true; std::string value_out; s = db.Get(read_options, “key1", &value_out); if (!s.IsOK()) cerr << s.ToString() << endl; ###Closing a database You can just let the `Database` object go out of scope, which will close it. If you need to access a `Database` with a pointer, deleting the pointer will close the database. // Example 1: Explicitly closing the database (not required) kdb::DatabaseOptions db_options; kdb::Database db(db_options, "mydb"); kdb::Status s = db.Open(); if (!s.IsOK()) cerr << s.ToString() << endl; db.Close(); // Example 2: The destructor of Database will call Close() when db goes out of scope { kdb::DatabaseOptions db_options; kdb::Database db(db_options, "mydb"); kdb::Status s = db.Open(); if (!s.IsOK()) cerr << s.ToString() << endl; } // Example 3: When using a pointer kdb::DatabaseOptions db_options; kdb::Database* db = new kdb::Database(db_options, "mydb"); kdb::Status s = db->Open(); if (!s.IsOK()) cerr << s.ToString() << endl; delete db; // the destructor of Database will call Close() **IMPORTANT:** Never create a `Database` with the `new` operator, unless you really need pointer semantics. ###Compression Compression is enabled by default, using the [LZ4 algorithm](https://github.com/Cyan4973/lz4). The compression option affects the behavior of an entire `Database`: there is no option to compress some entries and keep the other uncompressed, it’s all or nothing. The compression parameter can be `kNoCompression` or `kLZ4Compression`. 
For example, the following code creates a `Database` with compression disabled: kdb::DatabaseOptions db_options; db_options.compression = kdb::kNoCompression; kdb::Database db(db_options, "mydb"); db.Open(); ###Snapshots You can get a read-only, consistent view of the `Database` using a `Snapshot`: kdb::Snapshot snapshot = db.NewSnapshot(); std::string value_out; kdb::Status s = snapshot.Get("key1", &value_out); if (!s.IsOK()) cerr << s.ToString() << endl; ###Database and Snapshot interface You can use the KingDB abstract class if you want to pass either a `Database` or a `Snapshot`: kdb::DatabaseOptions db_options; kdb::Database db(db_options, "mydb"); kdb::Status s = db.Open(); if (!s.IsOK()) cerr << s.ToString() << endl; kdb::Snapshot snapshot = db.NewSnapshot(); kdb::KingDB* mydb; if (condition) { mydb = &db; } else { mydb = &snapshot; } std::string value_out; kdb::Status s = mydb->Get("key1", &value_out); if (!s.IsOK()) cerr << s.ToString() << endl; ###Iterators Iterating over all the entries of a `Database` or a `Snapshot` can be done with the `Iterator` class. kdb::Iterator it = db.NewIterator(); for (it.Begin(); it.IsValid(); it.Next()) { kdb::ByteArray key, value; key = it.GetKey(); value = it.GetValue(); } ###Working with the ByteArray class kdb::ByteArray key, value; // The data of a ByteArray can be transformed into std::string std::string key_str = key.ToString(); // The data of a ByteArray can also be accessed as a C char array char* data = value.data(); uint64_t size = value.size(); for (auto i = 0; i < size; ++i) { do_something(data[i]); } More information can be found in the [ByteArray section](#5-the-bytearray-class). ###Compaction You can trigger a compaction process to compact all the data and make a `Database` smaller. kdb::Status s = db.Compact(); if (!s.IsOK()) cerr << s.ToString() << endl; ### Flushing and Syncing a database You can force all KingDB's internal buffers to be flushed and synced to disk.
kdb::Status s = db.Flush(); if (!s.IsOK()) cerr << s.ToString() << endl; ##5. The ByteArray class The `ByteArray` class allows to abstract the access to arbitrary arrays of bytes. The array can be allocated memory, a memory-mapped file, a shared memory, etc., it will all be transparent through the use of `ByteArray`. kdb::ByteArray ba; char* mydata = ba.data(); // char* to the memory location uint64_t mysize = ba.size(); // size of the data found at the pointed memory location `ByteArray` objects can be assigned, returned, and passed by value. Inside, a reference counter guarantees that the resources they hold will stay alive for as long as needed. ###Deep-copy ByteArray The deep-copy `ByteArray` will allocate memory and copy the memory buffer it was passed. char* mybuffer = new char[1024]; FillWithRandomContent(mybuffer, 1024); kdb::ByteArray ba = kdb::NewDeepCopyByteArray(buffer, 1024); delete[] mybuffer; ba.data(); // 'ba' holds its own copy of the data, so the data // is still reachable even though 'mybuffer' was deleted. ###Shallow-copy ByteArray The shallow-copy `ByteArray` will become the owner of the memory address it was passed. char* mybuffer = new char[1024]; FillWithRandomContent(mybuffer, 1024); kdb::ByteArray ba = kdb::NewShallowCopyByteArray(mybuffer, 1024); ba.data() // 'ba' now owns the allocated memory pointed by 'mybuffer'. // When 'ba' will be destroyed, it will release that memory. ###Memory-mapped ByteArray If you want to read data from a file and used it as a `ByteArray`, you can simply let a `ByteArray` mmap() that file for you. std::string filepath("/tmp/myfile"); uint64_t filesize = 1024; kdb::ByteArray ba = kdb::NewMmappedByteArray(filepath, filesize); ###Pointer ByteArray The pointer `ByteArray` will hold a pointer to a memory location, but will not own it. If that memory location happens to be destroyed before the `ByteArray` is accessed, the program will likely crash due to a memory access violation error. 
The pointer `ByteArray` is very useful when high performance is needed as it doesn't need any memory allocation or system calls, but you need to use it with care. char* mybuffer = new char[1024]; FillWithRandomContent(mybuffer, 1024); kdb::ByteArray ba = kdb::NewPointerByteArray(mybuffer, 1024); delete[] mybuffer; // Wrong: the delete will work, but any subsequent access to 'ba' // is likely to make the program crash, because it is a pointer // ByteArray, it does not own the memory it points to. ##6. Multipart API ###Reading entries in multiple parts Currently, all entries larger than 1MB must be read with the multipart API. Why 1MB? It is a totally arbitrary size. Below 1MB, the `Get()` method of `Database` will allocate memory and fill that memory with the correct data for the value of the entry, taking care of the decompression if needed. Above 1MB, `Get()` will refuse to return the value of the entry, because it is possible that the value is just too big to fit in memory, thus the check prevents KingDB from crashing. In that case, KingDB forces the user to use the multipart API. Again, the 1MB value is completely arbitrary: it is just a rule of thumb. If you are unsure whether or not your entries are multipart, you can either check the returning `Status` of `Get()` and adapt your code appropriately, or just use the multipart API for all your entries. Indeed, *all* entries can be read using the multipart API, even the ones that were not stored with the multipart API.
Thus if you don’t know if some of your entries are larger than 1MB, the first solution is to check the return of `Get()`: kdb::ReadOptions read_options; std::string value_out; kdb::Status s = db.Get(read_options, key, &value_out); if (s.IsOK()) { do_something(value_out); } else if (s.IsMultipartRequired()) { kdb::MultipartReader mp_reader = db_->NewMultipartReader(read_options, key); for (mp_reader.Begin(); mp_reader.IsValid(); mp_reader.Next()) { kdb::ByteArray part; kdb::Status s = mp_reader.GetPart(&part); if (!s.IsOK()) { cerr << s.ToString() << endl; break; } do_something(part); } kdb::Status s = mp_reader.GetStatus(); if (!s.IsOK()) { cerr << s.ToString() << endl; } } else { cerr << s.ToString() << endl; } and the second solution is just to read all entries with the multipart API, which reduces the code: kdb::ReadOptions read_options; kdb::MultipartReader mp_reader = db_->NewMultipartReader(read_options, key); for (mp_reader.Begin(); mp_reader.IsValid(); mp_reader.Next()) { kdb::ByteArray part; kdb::Status s = mp_reader.GetPart(&part); if (!s.IsOK()) { cerr << s.ToString() << endl; break; } do_something(part); } kdb::Status s = mp_reader.GetStatus(); if (!s.IsOK()) cerr << s.ToString() << endl; ###Writing entries in multiple parts int total_size = 1024 * 1024 * 128; // 128MB char buffer[1024 * 1024 * 128]; kdb::MultipartWriter mp_writer = db_->NewMultipartWriter(write_options, key, total_size); int step = 1024 * 64; // 64KB steps for (auto i = 0; i < total_size; i += step) { kdb::ByteArray value = kdb::NewDeepCopyByteArray(buffer + i, step); kdb::Status s = mp_writer.PutPart(value); if (!s.IsOK()) { cerr << s.ToString() << endl; break; } } ###Multipart entries can be read in Iterators for (it.Begin(); it.IsValid(); it.Next()) { kdb::ByteArray key = it.GetKey(); kdb::MultipartReader mp_reader = it.GetMultipartValue(); for (mp_reader.Begin(); mp_reader.IsValid(); mp_reader.Next()) { kdb::ByteArray part; kdb::Status s = mp_reader.GetPart(&part); if (!s.IsOK()) { cerr <<
s.ToString() << endl; break; } } kdb::Status s = mp_reader.GetStatus(); if (!s.IsOK()) cerr << s.ToString() << endl; } ##7. Logging with Syslog ###Selecting a log level All the logging goes through [Syslog](http://en.wikipedia.org/wiki/Syslog), a protocol for message logging on Unix-based operating systems. The logging modules of KingDB and KingServer use Syslog to log activity and errors, and let the Syslog server on the machine handle storage and log rotation. KingDB emits log messages with different priority levels, following most the priority levels of Syslog: - silent: all logging is turned off - emerg: system is unusable, imminent crash - alert: error event, immediate action required - crit: error event, immediate action required - error: error event, action is required but is not urgent - warn: events that can be harmful if no action is taken - notice: unusual events, but no immediate action required - info: normal operation events, no action required - debug: events used for debugging, no action required - trace: fine-grained events used for debugging, no action required The default level is "info", and you can select the level of logging that you want with the kdb::Logger interface. For example, the following code set the logging level to “emerg”: kdb::Logger::set_current_level("emerg"); ###Dedicated log file By default, the log message will go to /var/log/system.log. You can also configure Syslog to store the KingDB and KingServer log messages to a dedicated log file on the machine. Below are examples of how to configure a Ubuntu server or a Mac OS X system to log all the messages emitted by KingDB to a dedicated file at the path /var/log/kingdb.log. ####On Ubuntu: 1. Open the rsyslog configuration file: $ sudo vim /etc/rsyslog.d/50-default.conf 2. Add a filter at the top of the file: :syslogtag, contains, "kingdb" /var/log/kingdb.log & ~ 3. 
Restart rsyslog $ sudo service rsyslog restart ####On Mac OS X (using the [FreeBSD Syslog configuration](https://www.freebsd.org/doc/handbook/configtuning-syslog.html)): 1. Open the syslog configuration file: $ sudo vim /etc/syslog.conf 2. Add a filter at the top: !kingdb *.* /var/log/kingdb.log !* 3. Restart syslogd: $ sudo killall -HUP syslog syslogd 4. If the restart command above does not work, try this: $ sudo launchctl unload /System/Library/LaunchDaemons/com.apple.syslogd.plist $ sudo launchctl load /System/Library/LaunchDaemons/com.apple.syslogd.plist ##8. Options ###ReadOptions `verify_checksums` When set to true, the reads will verify the checksums and return an error when a checksum mismatch is detected. Default value: False (Boolean) ###WriteOptions `sync` When set to true, the writes will be synced to secondary storage by calling fdatasync() on the file descriptor internally. Default value: False (Boolean) ###DatabaseOptions `create_if_missing` Will create the database if it does not already exists. Default value: true (Boolean) `error_if_exists` Will exit if the database already exists. Default value: false (Boolean) `compression` Compression algorithm used by the storage engine. Can be `kdb::kNoCompression` or `kdb::kLZ4Compression`. Default value: `kdb::kLZ4Compression` `hash` Hashing algorithm used by the storage engine. Can be `kdb::kxxHash_64` or `kdb::kMurmurHash3_64`. Default value: `kdb::kxxHash_64` `rate_limit_incoming` Limit the rate of incoming traffic, in bytes per second. Unlimited if equal to 0. Default value: 0 (Unsigned 64-bit integer) `write_buffer__size` Size of the Write Buffer. Default value: 64MB (Unsigned 64-bit integer) `write_buffer__flush_timeout` The timeout after which the write buffer will flush its cache. Default value: 500 milliseconds (Unsigned 64-bit integer) `write_buffer__mode` The mode with which the write buffer handles incoming traffic, can be `kdb::kWriteBufferModeDirect` or `kdb::kWriteBufferModeAdaptive`. 
With `kdb::kWriteBufferModeDirect`, once the Write Buffer is full other incoming Write and Delete operations will block until the buffer is persisted to secondary storage. The direct mode should be used when the clients are not subjects to timeouts. When choosing `kdb::kWriteBufferModeAdaptive`, incoming orders will be made slower, down to the speed of the writes on the secondary storage, so that they are almost just as fast as when using the direct mode, but are never blocking. The adaptive mode is expected to introduce a small performance decrease, but required for cases where clients timeouts must be avoided, for example when the database is used over a network. Default value: `kdb::kWriteBufferModeDirect` `storage__hstable_size` Maximum size a HSTable can have. Entries with keys and values beyond that size are considered to be large entries. Default value: 32MB (Unsigned 64-bit integer) `storage__minimum_free_space_accept_orders` Minimum free disk space required to accept incoming orders. It is recommended that for this value to be at least (2 x `write_buffer__size` + 4 x `storage__hstable_size`), so that when the file system fills up, the two write buffers can be flushed to secondary storage safely and the survival-mode compaction process can be run. Default value: 192MB (Unsigned 64-bit integer) `storage__maximum_part_size` The maximum part size is used by the storage engine to split entries into smaller parts -- important for the compression and hashing algorithms, can never be more than (2^32 - 1) as the algorihms used do not support sizes above that value. Default value: 1MB (Unsigned 64-bit integer) `storage__inactivity_streaming` The time of inactivity after which an entry stored with the streaming API is considered left for dead, and any subsequent incoming parts for that entry are rejected. 
Default value: 60 seconds (Unsigned 64-bit integer) `storage__statistics_polling_interval` The frequency at which statistics are polled in the Storage Engine (free disk space, etc.). Default value: 5 seconds (Unsigned 64-bit integer) `compaction__force_interval` Duration after which, if no compaction process has been performed, a compaction is started. Set to 0 to disable. Default value: 5 minutes (Unsigned 64-bit integer) `compaction__filesystem__free_space_required` Minimum free space on the file system required for a compaction process to be started. Default value: 128MB (Unsigned 64-bit integer) `compaction__filesystem__survival_mode_threshold` If the free space on the file system is above that threshold, the compaction is in 'normal mode'. Below that threshold, the compaction is in 'survival mode'. Each mode triggers the compaction process for different amount of uncompacted data found in the database. Default value: 2GB (Unsigned 64-bit integer) `compaction__filesystem__normal_batch_size` If the compaction is in normal mode and the amount of uncompacted data is above that value of `compaction__filesystem__normal_batch_size`, then the compaction will start when the compaction conditions are checked. Default value: 1GB (Unsigned 64-bit integer) `compaction__filesystem__survival_batch_size` If the compaction is in survival mode and the amount of uncompacted data is above that value of `compaction__filesystem__survival_batch_size`, then the compaction will start when the compaction conditions are checked. Default value: 256MB (Unsigned 64-bit integer) ================================================ FILE: doc/kingserver.md ================================================ Documentation of KingServer v0.9.0 ================================== ##Table of Contents **[1. What is KingServer?](#1-what-is-kingserver)** **[2. How to install KingServer](#2-how-to-install-kingserver)** **[3. How to run KingServer?](#3-how-to-run-kingserver)** **[4.
Configuration files](#4-configuration-files)** **[5. Logging with Syslog](#5-logging-with-syslog)** **[6. Options](#6-options)** ##1. What is KingServer? KingServer is a server application that embeds KingDB and implements the Memcached protocol. It allows you to access your data through a network interface using whatever programming language you want. Use any Memcached client for the programming language that you want to use, point it to your KingServer instance, and start sending data in. It's really that simple! Note that the current version of KingServer, version 0.9.0, implements only a subset of the operations of the Memcached protocol, which are: GET, SET, and DELETE. If you want more details about this protocol, you can refer to the [Memcached protocol specifications](https://github.com/memcached/memcached/blob/master/doc/protocol.txt). In addition, keep in mind that KingServer is not a distributed system: the data lives in a single machine. You can backup your data simply by setting up a periodic rsync between the directory where your KingDB database is stored, and your backup machine. For more information about KingDB, check out the [KingDB documentation](kingdb.md). ##2. How to install KingServer? The installation process is the same as for KingDB. KingServer has no external dependencies and has been tested on: - Mac OS X 10.9.5 with Apple LLVM version 6.0 (clang-600.0.51) - Linux Ubuntu 14.04 x64 with GCC 4.9.2 - Linux Ubuntu 15.04 x64 with GCC 4.9.2-10ubuntu13 - Linux CentOS 6.5 x86\_64 with GCC 4.9.2 Because KingServer uses C++11, you need GCC >=4.9.2 or Clang >=3.3. The following commands will compile KingServer and will install the `kingserver` program. $ tar zxvf kingdb.tar.gz $ cd kingdb $ make $ sudo make install If you are using GCC, update the Makefile and add \-fno\-builtin\-memcmp in the CFLAGS, and if you have tcmalloc on your system, add \-ltcmalloc to the LDFLAGS. This will give you a nice performance speed\-up. ##3. How to run KingServer? 
To start a server, the only required parameter is the location where your want to store your data, specified by the `--db.path` parameter. The following command will run KingServer as a daemon process in background, which will listen to the port 11211 (default Memcached port), and will store the data in the path /tmp/mydb: $ kingserver --db.path=/tmp/mydb When you are done, you can stop the daemon by sending it a SIGTERM (15): $ pkill -f kingserver # will send SIGTERM to all processes whose name match 'kingserver' For rapid testing and prototyping, you can also prevent KingServer from running as a daemon, and run it in foreground. You can also redirect the logging to stderr to monitor what is going on inside: $ kingserver --foreground --log.target=stderr --db.path=/tmp/mydb 2015/03/29-13:49:33.687759 0x7fff78663310 KingServer Daemon has started When you are done, you can stop the daemon with CTRL+C: 2015/03/29-13:54:02.623910 0x7fff78663310 KingServer Received signal [2] 2015/03/29-13:54:02.627277 0x7fff78663310 KingServer Daemon has stopped ##4. Implementing a network client to reach KingServer The Memcached project keeps a [list of Memcached clients](https://code.google.com/p/memcached/wiki/Clients) for many programming languages. If you don't find your language in there, a simple Google search will find you a client. Imagine that your client code is in Python, and that you pick the Memcached package for Python called `pylibmc`. A test client program would look like this: import pylibmc client = pylibmc.Client(["127.0.0.1:11211"]) print "Setting 'key1' to 'value1'" client['key1'] = 'value1' print "Retrieving the value for 'key1':" + client['key1'] As long as your point the client to right IP and port where KingServer listens, you'll be able to access your data. ##4. Configuration files It would be tedious to have to specify all the options on the command line all the time. KingServer allows you to use a configuration file to set values for all options. 
Such a file would look like this: # hash can be used to add comments server.recv-socket-buffer-size 8kb server.interface.memcached-port 11211 server.num-threads 500 db.path /tmp/mydb db.write-buffer.size 256mb db.write-buffer.flush-timeout 5 seconds db.storage.hstable-size 512mb db.compaction.force-interval 30000ms For the data size parameters, such as `db.storage.hstable-size`: the default unit is the byte, but any other human-readable size units can also be specified. In the example configuration above, `db.storage.hstable-size` is set to 512mb, and the configuration manager of KingDB will convert that to bytes for you, and accepts both in lowercase or uppercase. The accepted size units are: b, byte, bytes, kb, mb, gb, tb, pb. For the time-related parameters, such as `db.write-buffer.flush-timeout`, the default unit is the millisecond, but any other human-readable time units can also be specified. The accepted time units are: ms, millisecond, milliseconds, s, second, seconds, minute, minutes, hour, hours. If the `--configfile` parameter is specified, KingServer will use the configuration file at that path. If no configuration file is specified, KingServer will look for one at the path `./kingdb.conf`, and `/etc/kingdb.conf`. If no file is found, the default values of all parameters will be used. $ kingserver --db.path /tmp/mydb --configfile /tmp/kingdb.conf For a complete list of all the available options, you can use the `--help` parameter: $ kingserver --help This list is reproduced below in the [Options section](#6-options). ##5. Logging with Syslog ###Selecting a log level All the logging goes through [Syslog](http://en.wikipedia.org/wiki/Syslog), a protocol for message logging on Unix-based operating systems. The logging modules of KingDB and KingServer use Syslog to log activity and errors, and let the Syslog server on the machine handle storage and log rotation.
KingDB emits log messages with different priority levels, following most the priority levels of Syslog: - silent: all logging is turned off - emerg: system is unusable, imminent crash - alert: error event, immediate action required - crit: error event, immediate action required - error: error event, action is required but is not urgent - warn: events that can be harmful if no action is taken - notice: unusual events, but no immediate action required - info: normal operation events, no action required - debug: events used for debugging, no action required - trace: fine-grained events used for debugging, no action required ###Dedicated log file By default, the log message will go to /var/log/system.log. You can also configure Syslog to store the KingDB and KingServer log messages to a dedicated log file on the machine. Below are examples of how to configure a Ubuntu server or a Mac OS X system to log all the messages emitted by KingDB to a dedicated file at the path /var/log/kingdb.log. ####On Ubuntu: 1. Open the rsyslog configuration file: $ sudo vim /etc/rsyslog.d/50-default.conf 2. Add a filter at the top of the file: :syslogtag, contains, "kingdb" /var/log/kingdb.log & ~ 3. Restart rsyslog $ sudo service rsyslog restart ####On Mac OS X (using the [FreeBSD Syslog configuration](https://www.freebsd.org/doc/handbook/configtuning-syslog.html)): 1. Open the syslog configuration file: $ sudo vim /etc/syslog.conf 2. Add a filter at the top: !kingdb *.* /var/log/kingdb.log !* 3. Restart syslogd: $ sudo killall -HUP syslog syslogd 4. If the restart command above does not work, try this: $ sudo launchctl unload /System/Library/LaunchDaemons/com.apple.syslogd.plist $ sudo launchctl load /System/Library/LaunchDaemons/com.apple.syslogd.plist ##6. Options ###Server Options Options that alter the behavior of the KingServer network server. `--configfile` Configuration file. If not specified, the path ./kingdb.conf and /etc/kingdb.conf will be tested. 
Default value: ./kingdb.conf (String) `--foreground` When set, the server will run as a foreground process. By default, the server runs as a daemon process. Default value: not set (Flag) `--log.level` Level of the logging, can be: silent, emerg, alert, crit, error, warn, notice, info, debug, trace. Default value: info (String) `--log.target` Target of the logs, can be 'stderr' to log to stderr, or any custom string that will be used as the 'ident' parameter for syslog. Default value: kingdb (String) `--server.recv-socket-buffer-size` Size of the buffer used to receive data from the network. Each thread of the server has one such buffer. Default value: 64KB (Unsigned 64-bit integer) `--server.listen-backlog` Size of the listen() backlog. Default value: 150 (Unsigned 32-bit integer) `--server.num-threads` Num of threads in the pool of workers. Default value: 150 (Unsigned 32-bit integer) `--server.interface.memcached-port` Port where the memcached interface will listen. Default value: 11211 (Unsigned 32-bit integer) ###Database Options With the following options, you can change the behavior of the KingDB database embedded in the KingServer process that you are running. `--db.path` Path where the database can be found or will be created. This parameter is *mandatory* (String) `--db.create-if-missing` Will create the database if it does not already exists. Default value: True (Boolean) `--db.error-if-exists` Will exit if the database already exists. Default value: False (Boolean) `--db.incoming-rate-limit` Limit the rate of incoming traffic, in bytes per second. Unlimited if equal to 0. Default value: 0 (Unsigned 64-bit integer) `--db.write-buffer.size` Size of the Write Buffer. Default value: 64MB (Unsigned 64-bit integer) `--db.write-buffer.flush-timeout` The timeout after which the write buffer will flush its cache. 
Default value: 500 milliseconds (Unsigned 64-bit integer) `--db.write-buffer.mode` The mode with which the write buffer handles incoming traffic, can be 'direct' or 'adaptive'. With the 'direct' mode, once the Write Buffer is full other incoming Write and Delete operations will block until the buffer is persisted to secondary storage. The direct mode should be used when the clients are not subjects to timeouts. When choosing the 'adaptive' mode, incoming orders will be made slower, down to the speed of the writes on the secondary storage, so that they are almost just as fast as when using the direct mode, but are never blocking. The adaptive mode is expected to introduce a small performance decrease, but required for cases where clients timeouts must be avoided, for example when the database is used over a network. Default value: adaptive (String) `--db.storage.hstable-size` Maximum size a HSTable can have. Entries with keys and values beyond that size are considered to be large entries. Default value: 32MB (Unsigned 64-bit integer) `--db.storage.compression` Compression algorithm used by the storage engine. Can be 'disabled' or 'lz4'. Default value: lz4 (String) `--db.storage.hashing` Hashing algorithm used by the storage engine. Can be 'xxhash-64' or 'murmurhash3-64'. Default value: xxhash-64 (String) `--db.storage.minimum-free-space-accept-orders` Minimum free disk space required to accept incoming orders. It is recommended that for this value to be at least (2 x 'db.write-buffer.size' + 4 x 'db.storage.hstable-size'), so that when the file system fills up, the two write buffers can be flushed to secondary storage safely and the survival-mode compaction process can be run. 
Default value: 192MB (Unsigned 64-bit integer) `--db.storage.maximum-part-size` The maximum part size is used by the storage engine to split entries into smaller parts -- important for the compression and hashing algorithms, can never be more than (2^32 - 1) as the algorithms used do not support sizes above that value. Default value: 1MB (Unsigned 64-bit integer) `--db.storage.inactivity-streaming` The time of inactivity after which an entry stored with the streaming API is considered left for dead, and any subsequent incoming parts for that entry are rejected. Default value: 60 seconds (Unsigned 64-bit integer) `--db.storage.statistics-polling-interval` The frequency at which statistics are polled in the Storage Engine (free disk space, etc.). Default value: 5 seconds (Unsigned 64-bit integer) `--db.compaction.force-interval` Duration after which, if no compaction process has been performed, a compaction is started. Set to 0 to disable. Default value: 5 minutes (Unsigned 64-bit integer) `--db.compaction.filesystem.free-space-required` Minimum free space on the file system required for a compaction process to be started. Default value: 128MB (Unsigned 64-bit integer) `--db.compaction.filesystem.survival-mode-threshold` If the free space on the file system is above that threshold, the compaction is in 'normal mode'. Below that threshold, the compaction is in 'survival mode'. Each mode triggers the compaction process for different amounts of uncompacted data found in the database. Default value: 2GB (Unsigned 64-bit integer) `--db.compaction.filesystem.normal-batch-size` If the compaction is in normal mode and the amount of uncompacted data is above that value of 'normal-batch-size', then the compaction will start when the compaction conditions are checked.
Default value: 1GB (Unsigned 64-bit integer) `--db.compaction.filesystem.survival-batch-size` If the compaction is in survival mode and the amount of uncompacted data is above that value of 'survival-batch-size', then the compaction will start when the compaction conditions are checked. Default value: 256MB (Unsigned 64-bit integer) ================================================ FILE: include/kingdb/kdb.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. #ifndef KINGDB_HEADERS_H_ #define KINGDB_HEADERS_H_ #include "util/byte_array.h" #include "util/status.h" #include "util/options.h" #include "interface/kingdb.h" #include "interface/database.h" #include "interface/snapshot.h" #include "interface/iterator.h" #include "interface/multipart.h" #endif // KINGDB_HEADERS_H_ ================================================ FILE: interface/database.cc ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. 
#include "interface/database.h"

namespace kdb {

// Retrieves the value stored for 'key', looking first in the write buffer and,
// if absent there, in the storage engine. When 'want_raw_data' is false and
// the stored value is compressed, the value is uncompressed before being
// handed back to the caller.
// Returns:
//  - Status::OK() on success
//  - Status::NotFound() if the key is absent or shadowed by a delete order
//  - Status::MultipartRequired() if the value is too large for a single
//    buffer and must be read through a MultipartReader
Status Database::GetRaw(ReadOptions& read_options,
                        ByteArray& key,
                        ByteArray* value_out,
                        bool want_raw_data) {
  if (is_closed_) return Status::IOError("The database is not open");
  log::trace("Database GetRaw()", "[%s]", key.ToString().c_str());
  Status s = wb_->Get(read_options, key, value_out);
  if (s.IsDeleteOrder()) {
    // A pending delete order in the buffer shadows any older entry on disk
    return Status::NotFound("Unable to find entry");
  } else if (s.IsNotFound()) {
    log::trace("Database GetRaw()", "not found in buffer");
    s = se_->Get(read_options, key, value_out);
    if (s.IsNotFound()) {
      log::trace("Database GetRaw()", "not found in storage engine");
      return s;
    } else if (s.IsOK()) {
      log::trace("Database GetRaw()", "found in storage engine");
    } else {
      log::trace("Database GetRaw()", "unidentified error");
      return s;
    }
  } else {
    log::trace("Database GetRaw()", "found in buffer");
  }

  // TODO-36: There is technical debt here:
  // 1. The uncompression should be able to proceed without having to call a
  //    Multipart Reader.
  // 2. The uncompression should be able to operate within a single buffer, and
  //    not have to copy data into intermediate buffers through the Multipart
  //    Reader as it is done here. Having intermediate buffers means that there
  //    is more data copy than necessary, thus more time wasted.
  if (want_raw_data == false && value_out->is_compressed()) {
    if (value_out->size() > db_options_.internal__size_multipart_required) {
      return Status::MultipartRequired();
    }
    ByteArray value_out_uncompressed;
    compressor_.ResetThreadLocalStorage();
    s = compressor_.UncompressByteArray(*value_out,
                                        read_options.verify_checksums,
                                        &value_out_uncompressed);
    // NOTE(fix): only overwrite the caller's output on success -- on error the
    // original code still assigned the invalid, partially-uncompressed buffer
    // to *value_out before returning the error status.
    if (s.IsOK()) *value_out = value_out_uncompressed;
  }
  return s;
}

// Public point lookup: GetRaw() that always returns uncompressed data.
Status Database::Get(ReadOptions& read_options, ByteArray& key, ByteArray* value_out) {
  return GetRaw(read_options, key, value_out, false);
}

// Stores a complete value in one call, implemented as a single-part PutPart().
Status Database::Put(WriteOptions& write_options, ByteArray& key, ByteArray& chunk) {
  return PutPart(write_options, key, chunk, 0, chunk.size());
}

// Stores one part of a value. Parts larger than the maximum part size allowed
// by the storage engine are split into valid-size sub-parts.
Status Database::PutPart(WriteOptions& write_options,
                         ByteArray& key,
                         ByteArray& chunk,
                         uint64_t offset_chunk,
                         uint64_t size_value) {
  if (is_closed_) return Status::IOError("The database is not open");
  if (offset_chunk + chunk.size() > size_value) {
    return Status::IOError("Attempted write beyond the total value size,
aborting write."); } if (size_value <= db_options_.storage__maximum_part_size) { return PutPartValidSize(write_options, key, chunk, offset_chunk, size_value); } // 'chunk' may be deleted by the call to PutPartValidSize() // and therefore it cannot be used in the loop test condition uint64_t size_chunk = chunk.size(); Status s; for (uint64_t offset = 0; offset < size_chunk; offset += db_options_.storage__maximum_part_size) { ByteArray key_new, chunk_new; if (offset + db_options_.storage__maximum_part_size < chunk.size()) { chunk_new = chunk; chunk_new.set_offset(offset); chunk_new.set_size(db_options_.storage__maximum_part_size); key_new = key; } else { chunk_new = chunk; chunk_new.set_offset(offset); chunk_new.set_size(size_chunk - offset); key_new = key; } s = PutPartValidSize(write_options, key_new, chunk_new, offset_chunk + offset, size_value); if (!s.IsOK()) break; } return s; } Status Database::PutPartValidSize(WriteOptions& write_options, ByteArray& key, ByteArray& chunk, uint64_t offset_chunk, uint64_t size_value) { if (is_closed_) return Status::IOError("The database is not open"); Status s; s = se_->FileSystemStatus(); if (!s.IsOK()) return s; log::trace("Database::PutPartValidSize()", "[%s] size_chunk:%" PRIu64 " offset_chunk:%" PRIu64, key.ToString().c_str(), chunk.size(), offset_chunk); bool do_compression = true; uint64_t size_value_compressed = 0; uint64_t offset_chunk_compressed = offset_chunk; ByteArray chunk_final; bool is_first_part = (offset_chunk == 0); bool is_last_part = (chunk.size() + offset_chunk == size_value); log::trace("Database::PutPartValidSize()", "CompressionType:%d", db_options_.compression.type); if ( chunk.size() == 0 || db_options_.compression.type == kNoCompression) { do_compression = false; } if (is_first_part) { ts_compression_enabled_.put(1); ts_offset_.put(0); } if (ts_compression_enabled_.get() == 0) { // If compression is disabled, chunks are copied uncompressed, but the first // of the chunk copied when compression was 
disabled was shifted to have a // frame header, thus the current offset needs to account for it. //offset_chunk_compressed += compressor_.size_frame_header(); offset_chunk_compressed = ts_offset_.get(); ts_offset_.put(offset_chunk_compressed + chunk.size()); } if (!do_compression || ts_compression_enabled_.get() == 0) { chunk_final = chunk; } else { //std::chrono::high_resolution_clock::time_point step00 = std::chrono::high_resolution_clock::now(); if (is_first_part) { compressor_.ResetThreadLocalStorage(); } //std::chrono::high_resolution_clock::time_point step01 = std::chrono::high_resolution_clock::now(); offset_chunk_compressed = compressor_.size_compressed(); uint64_t size_compressed; char *compressed; s = compressor_.Compress(chunk.data(), chunk.size(), &compressed, &size_compressed); if (!s.IsOK()) return s; //std::chrono::high_resolution_clock::time_point step02 = std::chrono::high_resolution_clock::now(); log::trace("Database::PutPartValidSize()", "[%s] size_compressed:%" PRIu64, key.ToString().c_str(), compressor_.size_compressed()); // Now Checking if compression should be disabled for this entry uint64_t size_remaining = size_value - offset_chunk; uint64_t space_left = size_value + EntryHeader::CalculatePaddingSize(size_value) - offset_chunk_compressed; if ( size_remaining - chunk.size() + compressor_.size_frame_header() > space_left - size_compressed) { delete[] compressed; compressed = new char[compressor_.size_uncompressed_frame(chunk.size())]; compressor_.DisableCompressionInFrameHeader(compressed); memcpy(compressed + compressor_.size_frame_header(), chunk.data(), chunk.size()); compressor_.AdjustCompressedSize(- size_compressed); size_compressed = chunk.size() + compressor_.size_frame_header(); ts_compression_enabled_.put(0); ts_offset_.put(compressor_.size_compressed() + size_compressed); } //std::chrono::high_resolution_clock::time_point step03 = std::chrono::high_resolution_clock::now(); ByteArray chunk_compressed = 
NewShallowCopyByteArray(compressed, size_compressed); //std::chrono::high_resolution_clock::time_point step04 = std::chrono::high_resolution_clock::now(); log::trace("Database::PutPartValidSize()", "[%s] (%" PRIu64 ") compressed size %" PRIu64 " - offset_chunk_compressed %" PRIu64, key.ToString().c_str(), chunk.size(), chunk_compressed.size(), offset_chunk_compressed); chunk_final = chunk_compressed; /* std::chrono::high_resolution_clock::time_point step05 = std::chrono::high_resolution_clock::now(); uint64_t duration00 = std::chrono::duration_cast(step01 - step00).count(); uint64_t duration01 = std::chrono::duration_cast(step02 - step01).count(); uint64_t duration02 = std::chrono::duration_cast(step03 - step02).count(); uint64_t duration03 = std::chrono::duration_cast(step04 - step03).count(); uint64_t duration04 = std::chrono::duration_cast(step05 - step04).count(); log::info("Database::PutPartValidSize()", "Durations: [%" PRIu64 "] [%" PRIu64 "] [%" PRIu64 "] [%" PRIu64 "] [%" PRIu64 "]", duration00, duration01, duration02, duration03, duration04 ); */ } if (do_compression && is_last_part) { if (ts_compression_enabled_.get() == 1) { size_value_compressed = compressor_.size_compressed(); } else { if (is_first_part) { // chunk is self-contained: first ans last size_value_compressed = ts_offset_.get(); } else { size_value_compressed = offset_chunk_compressed + chunk.size(); } } } // Compute CRC32 checksum uint32_t crc32 = 0; if (is_first_part) { crc32_.ResetThreadLocalStorage(); crc32_.stream(key.data(), key.size()); } crc32_.stream(chunk_final.data(), chunk_final.size()); if (is_last_part) crc32 = crc32_.get(); log::trace("Database PutPartValidSize()", "[%s] size_value_compressed:%" PRIu64 " crc32:0x%" PRIx64 " END", key.ToString().c_str(), size_value_compressed, crc32); uint64_t size_padding = do_compression ? 
EntryHeader::CalculatePaddingSize(size_value) : 0; if ( offset_chunk_compressed + chunk_final.size() > size_value + size_padding) { log::emerg("Database::PutPartValidSize()", "Error: write was attempted outside of the allocated memory."); return Status::IOError("Prevented write to occur outside of the allocated memory."); } // (size_value_compressed != 0 && chunk->size() + offset_chunk == size_value_compressed)); return wb_->PutPart(write_options, key, chunk_final, offset_chunk_compressed, size_value, size_value_compressed, crc32); } Status Database::Delete(WriteOptions& write_options, ByteArray& key) { if (is_closed_) return Status::IOError("The database is not open"); log::trace("Database::Delete()", "[%s]", key.ToString().c_str()); Status s = se_->FileSystemStatus(); if (!s.IsOK()) return s; return wb_->Delete(write_options, key); } void Database::Flush() { wb_->Flush(); } void Database::Compact() { wb_->Flush(); se_->FlushCurrentFileForForcedCompaction(); se_->Compact(); } Snapshot Database::NewSnapshot() { if (is_closed_) return Snapshot(); log::trace("Database::NewSnapshot()", "start"); wb_->Flush(); uint32_t fileid_end = se_->FlushCurrentFileForSnapshot(); std::set* fileids_ignore; uint32_t snapshot_id; Status s = se_->GetNewSnapshotData(&snapshot_id, &fileids_ignore); if (!s.IsOK()) return Snapshot(); StorageEngine *se_readonly = new StorageEngine(db_options_, nullptr, dbname_, true, fileids_ignore, fileid_end); std::vector *fileids_iterator = se_readonly->GetFileidsIterator(); Snapshot snapshot(db_options_, dbname_, se_, se_readonly, fileids_iterator, snapshot_id); return snapshot; } KingDB* Database::NewSnapshotPointer() { if (is_closed_) return nullptr; log::trace("Database::NewSnapshotPointer()", "start"); wb_->Flush(); uint32_t fileid_end = se_->FlushCurrentFileForSnapshot(); std::set* fileids_ignore; uint32_t snapshot_id; Status s = se_->GetNewSnapshotData(&snapshot_id, &fileids_ignore); if (!s.IsOK()) return nullptr; StorageEngine *se_readonly = new 
StorageEngine(db_options_, nullptr, dbname_, true, fileids_ignore, fileid_end); std::vector *fileids_iterator = se_readonly->GetFileidsIterator(); Snapshot *snapshot = new Snapshot(db_options_, dbname_, se_, se_readonly, fileids_iterator, snapshot_id); return snapshot; } Iterator Database::NewIterator(ReadOptions& read_options) { if (is_closed_) return Iterator(); KingDB* snapshot = NewSnapshotPointer(); Iterator it = snapshot->NewIterator(read_options); //Iterator *si = static_cast(it); it.SetParentSnapshot(snapshot); return it; } } // namespace kdb ================================================ FILE: interface/database.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. #ifndef KINGDB_INTERFACE_MAIN_H_ #define KINGDB_INTERFACE_MAIN_H_ #include "util/debug.h" #include #include #include #include #include #include #include #include #include "interface/kingdb.h" #include "cache/write_buffer.h" #include "storage/storage_engine.h" #include "storage/format.h" #include "util/status.h" #include "util/order.h" #include "util/byte_array.h" #include "util/options.h" #include "util/file.h" #include "interface/iterator.h" #include "interface/snapshot.h" #include "interface/multipart.h" #include "thread/threadstorage.h" #include "algorithm/compressor.h" #include "algorithm/crc32c.h" #include "algorithm/endian.h" namespace kdb { class Database: public KingDB { public: Database(const DatabaseOptions& db_options, const std::string& dbname) : db_options_(db_options), dbname_(FixDatabaseName(dbname)), is_closed_(true) { // Word-swapped endianness is not supported assert(getEndianness() == kBytesLittleEndian || getEndianness() == kBytesBigEndian); } Database(const std::string& dbname) : dbname_(FixDatabaseName(dbname)), is_closed_(true) { // Word-swapped endianness is not supported assert(getEndianness() == 
kBytesLittleEndian || getEndianness() == kBytesBigEndian); } virtual ~Database() { Close(); } std::string FixDatabaseName(const std::string& dbname) { // Allows both relative and absolute directory paths to work if (dbname.size() >= 1 && dbname[0] == '/') { return dbname; } else if (dbname.size() >= 2 && dbname[0] == '.' && dbname[1] == '/') { return FileUtil::kingdb_getcwd() + "/" + dbname.substr(2); } else { return FileUtil::kingdb_getcwd() + "/" + dbname; } } virtual Status Open() override { FileUtil::increase_limit_open_files(); Status s; struct stat info; bool db_exists = (stat(dbname_.c_str(), &info) == 0); if (db_exists && !(info.st_mode & S_IFDIR)) { return Status::IOError("A file with same name as the database already exists and is not a directory. Delete or rename this file to continue.", dbname_.c_str()); } if ( db_exists && db_options_.error_if_exists) { return Status::IOError("Could not create database directory", strerror(errno)); } if ( !db_exists && db_options_.create_if_missing && mkdir(dbname_.c_str(), 0755) < 0) { return Status::IOError("Could not create database directory", strerror(errno)); } std::string filepath_dboptions = DatabaseOptions::GetPath(dbname_); bool db_options_exists = (stat(filepath_dboptions.c_str(), &info) == 0); Status status_dboptions; DatabaseOptions db_options_candidate; if (db_options_exists) { // If there is a db_options file, try loading it log::trace("Database::Open()", "Loading db_options file"); if ((fd_dboptions_ = open(filepath_dboptions.c_str(), O_RDONLY, 0644)) < 0) { log::emerg("Database::Open()", "Could not open file [%s]: %s", filepath_dboptions.c_str(), strerror(errno)); } int ret = flock(fd_dboptions_, LOCK_EX | LOCK_NB); if (ret == EWOULDBLOCK || ret < 0) { close(fd_dboptions_); return Status::IOError("Could not acquire the global database lock: the database was already opened by another process"); } Mmap mmap(filepath_dboptions, info.st_size); if (!mmap.is_valid()) return Status::IOError("Mmap() 
constructor failed"); status_dboptions = DatabaseOptionEncoder::DecodeFrom(mmap.datafile(), mmap.filesize(), &db_options_candidate); if (status_dboptions.IsOK()) db_options_ = db_options_candidate; } if (db_exists && (!db_options_exists || !status_dboptions.IsOK())) { // The database already existed, but no db_options file was found in the // database directory, or the db_options file was present but invalid, // thus it needs to be recovered. DatabaseOptions db_options_candidate; std::string prefix_compaction = StorageEngine::GetCompactionFilePrefix(); Status s = HSTableManager::LoadDatabaseOptionsFromHSTables(dbname_, &db_options_candidate, prefix_compaction); if (s.IsOK()) db_options_ = db_options_candidate; } if (!db_exists || !db_options_exists || !status_dboptions.IsOK()) { // If there is no db_options file, or if it's invalid, write it log::trace("Database::Open()", "Writing db_options file"); if ((fd_dboptions_ = open(filepath_dboptions.c_str(), O_WRONLY|O_CREAT, 0644)) < 0) { log::emerg("Database::Open()", "Could not open file [%s]: %s", filepath_dboptions.c_str(), strerror(errno)); } char buffer[DatabaseOptionEncoder::GetFixedSize()]; DatabaseOptionEncoder::EncodeTo(&db_options_, buffer); if (write(fd_dboptions_, buffer, DatabaseOptionEncoder::GetFixedSize()) < 0) { close(fd_dboptions_); return Status::IOError("Could not write 'db_options' file", strerror(errno)); } } Hash* hash = MakeHash(db_options_.hash); uint64_t max_size_hash = hash->MaxInputSize(); delete hash; if (db_options_.storage__maximum_part_size > std::numeric_limits::max()) { return Status::IOError("db.storage.maximum-part-size cannot be greater than max int32. Fix your options."); } if (db_options_.storage__maximum_part_size >= db_options_.storage__hstable_size) { return Status::IOError("The maximum size of a chunk cannot be larger than the minimum size of a large file (db.storage.maximum-part-size >= db.storage.hstable-size). 
Fix your options."); } if (db_options_.storage__maximum_part_size > max_size_hash) { return Status::IOError("db.storage.maximum-part-size cannot be greater than the maximum input size of the hash function you chose. Fix your options."); } if ( db_options_.compression.type != kNoCompression && db_options_.storage__maximum_part_size > compressor_.MaxInputSize()) { return Status::IOError("db.storage.maximum-part-size cannot be greater than the maximum input size of the compression function you chose. Fix your options."); } std::unique_lock lock(mutex_close_); if (!is_closed_) return Status::IOError("The database is already open"); em_ = new EventManager(); wb_ = new WriteBuffer(db_options_, em_); se_ = new StorageEngine(db_options_, em_, dbname_); is_closed_ = false; return Status::OK(); } virtual void Close() override { std::unique_lock lock(mutex_close_); if (is_closed_) return; flock(fd_dboptions_, LOCK_UN); close(fd_dboptions_); is_closed_ = true; wb_->Close(); se_->Close(); delete wb_; delete se_; delete em_; } // TODO: make sure that if an entry cannot be returned because memory cannot // be allocated, a proper error message is returned -- same for the Iterator // and Snapshot virtual Status Get(ReadOptions& read_options, ByteArray& key, ByteArray* value_out) override; virtual Status Get(ReadOptions& read_options, ByteArray& key, std::string* value_out) { return KingDB::Get(read_options, key, value_out); } virtual Status Get(ReadOptions& read_options, const std::string& key, ByteArray* value_out) { return KingDB::Get(read_options, key, value_out); } virtual Status Get(ReadOptions& read_options, const std::string& key, std::string* value_out) { return KingDB::Get(read_options, key, value_out); } virtual Status Put(WriteOptions& write_options, ByteArray& key, ByteArray& value) override; virtual Status Put(WriteOptions& write_options, ByteArray& key, const std::string& chunk) { return KingDB::Put(write_options, key, chunk); } virtual Status Put(WriteOptions& 
write_options, const std::string& key, ByteArray& chunk) { return KingDB::Put(write_options, key, chunk); } virtual Status Put(WriteOptions& write_options, const std::string& key, const std::string& chunk) { return KingDB::Put(write_options, key, chunk); } virtual Status PutPart(WriteOptions& write_options, ByteArray& key, ByteArray& chunk, uint64_t offset_chunk, // TODO: could the offset be handled by the method itself? uint64_t size_value) override; virtual Status Delete(WriteOptions& write_options, ByteArray& key) override; virtual Snapshot NewSnapshot(); virtual Iterator NewIterator(ReadOptions& read_options) override; virtual MultipartReader NewMultipartReader(ReadOptions& read_options, ByteArray& key) { ByteArray value; Status s = GetRaw(read_options, key, &value, true); if (!s.IsOK()) { return MultipartReader(s); } else { return MultipartReader(read_options, value); } } virtual MultipartReader NewMultipartReader(ReadOptions& read_options, const std::string& key_str) { ByteArray key = NewDeepCopyByteArray(key_str); return NewMultipartReader(read_options, key); } MultipartWriter NewMultipartWriter(WriteOptions& write_options, ByteArray& key, uint64_t size_value_total) { return MultipartWriter(this, write_options, key, size_value_total); } MultipartWriter NewMultipartWriter(WriteOptions& write_options, const std::string& key_str, uint64_t size_value_total) { ByteArray key = NewDeepCopyByteArray(key_str); return MultipartWriter(this, write_options, key, size_value_total); } virtual void Flush(); virtual void Compact(); private: KingDB* NewSnapshotPointer(); Status GetRaw(ReadOptions& read_options, ByteArray& key, ByteArray* value_out, bool want_raw_data); Status PutPartValidSize(WriteOptions& write_options, ByteArray& key, ByteArray& chunk, uint64_t offset_chunk, uint64_t size_value); kdb::DatabaseOptions db_options_; std::string dbname_; kdb::WriteBuffer *wb_; kdb::StorageEngine *se_; kdb::EventManager *em_; kdb::CompressorLZ4 compressor_; kdb::CRC32 crc32_; 
ThreadStorage ts_compression_enabled_; ThreadStorage ts_offset_; bool is_closed_; int fd_dboptions_; std::mutex mutex_close_; }; } // namespace kdb #endif // KINGDB_INTERFACE_MAIN_H_ ================================================ FILE: interface/iterator.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. #ifndef KINGDB_ITERATOR_MAIN_H_ #define KINGDB_ITERATOR_MAIN_H_ #include #include "util/status.h" #include "util/order.h" #include "util/byte_array.h" #include "util/options.h" #include "util/file.h" #include "interface/kingdb.h" #include "interface/multipart.h" #include "storage/storage_engine.h" namespace kdb { class IteratorResource { public: virtual ~IteratorResource() {}; virtual void Close() = 0; virtual void SetParentSnapshot(KingDB *snapshot) = 0; virtual void Begin() = 0; virtual bool IsValid() = 0; virtual bool Next() = 0; virtual ByteArray GetKey() = 0; virtual ByteArray GetValue() = 0; virtual MultipartReader GetMultipartValue() = 0; virtual Status GetStatus() = 0; }; class RegularIterator: public IteratorResource { public: RegularIterator() : is_closed_(true), se_readonly_(nullptr), snapshot_(nullptr), status_(Status::IOError("Invalid iterator")) { } RegularIterator(ReadOptions& read_options, StorageEngine *se_readonly, std::vector* fileids_iterator) : is_closed_(false), se_readonly_(se_readonly), read_options_(read_options), snapshot_(nullptr), fileids_iterator_(fileids_iterator), status_(Status::OK()) { log::trace("RegularIterator::ctor()", "start"); log::trace("RegularIterator::ctor()", "fileids_iterator_->size():%u", fileids_iterator_->size()); } ~RegularIterator() { Close(); } void Close() { std::unique_lock lock(mutex_); if (!is_closed_) { is_closed_ = true; if (snapshot_ != nullptr) { delete snapshot_; snapshot_ = nullptr; } } } RegularIterator(RegularIterator&& it) : 
mutex_() { log::trace("RegularIterator::move-ctor()", "start"); this->se_readonly_ = it.se_readonly_; this->read_options_ = it.read_options_; this->snapshot_ = it.snapshot_; this->fileids_iterator_ = it.fileids_iterator_; this->is_closed_ = it.is_closed_; it.snapshot_ = nullptr; } void SetParentSnapshot(KingDB *snapshot) { snapshot_ = snapshot; } void Begin() { log::trace("RegularIterator::Begin()", "start"); if (se_readonly_ == nullptr) { is_valid_ = false; return; } mutex_.lock(); fileid_current_ = 0; has_file_ = false; index_fileid_ = 0; is_valid_ = true; mutex_.unlock(); Next(); log::trace("RegularIterator::Begin()", "end"); } bool IsValid() { log::trace("RegularIterator::IsValid()", "start"); std::unique_lock lock(mutex_); log::trace("RegularIterator::IsValid()", "end"); return is_valid_; } bool Next() { log::trace("RegularIterator::Next()", "start"); std::unique_lock lock(mutex_); if (!is_valid_) return false; status_ = Status::OK(); Status s; while (true) { log::trace("RegularIterator::Next()", "loop index_file:[%u] index_location:[%u]", index_fileid_, index_location_); if (index_fileid_ >= fileids_iterator_->size()) { log::trace("RegularIterator::Next()", "invalid index_fileid_:[%u] fileids_iterator_->size():[%u]", index_fileid_, fileids_iterator_->size()); is_valid_ = false; break; } if (!has_file_) { log::trace("RegularIterator::Next()", "initialize file"); fileid_current_ = fileids_iterator_->at(index_fileid_); filepath_current_ = se_readonly_->GetFilepath(fileid_current_); struct stat info; if (stat(filepath_current_.c_str(), &info) != 0) { index_fileid_ += 1; continue; } Mmap mmap(filepath_current_.c_str(), info.st_size); if (!mmap.is_valid()) break; uint64_t dummy_filesize; bool dummy_is_file_large; std::multimap index_temp; s = HSTableManager::LoadFile(mmap, fileid_current_, index_temp, &dummy_filesize, &dummy_is_file_large); if (!s.IsOK()) { index_fileid_ += 1; continue; } locations_current_.clear(); for (auto& p: index_temp) { 
locations_current_.push_back(p.second); } std::sort(locations_current_.begin(), locations_current_.end()); index_location_ = 0; has_file_ = true; } log::trace("RegularIterator::Next()", "has file"); if (index_location_ >= locations_current_.size()) { log::trace("RegularIterator::Next()", "index_location_ is out"); has_file_ = false; index_fileid_ += 1; continue; } // Get entry at the location ByteArray key, value; uint64_t location_current = locations_current_[index_location_]; Status s = se_readonly_->GetEntry(read_options_, location_current, &key, &value); if (!s.IsOK()) { log::trace("RegularIterator::Next()", "GetEntry() failed: %s", s.ToString().c_str()); index_location_ += 1; continue; } // Get entry for the key found at the location, and continue if the // location is a mismatch -- i.e. the current entry has been overwritten // by a later entry. bool is_last = se_readonly_->IsLocationLastInIndex(location_current, key); if (!is_last) { //fprintf(stderr, "was not last, need to check\n"); ByteArray value_alt; uint64_t location_out; s = se_readonly_->Get(read_options_, key, &value_alt, &location_out); if (!s.IsOK()) { log::trace("RegularIterator::Next()", "Get(): failed: %s", s.ToString().c_str()); index_fileid_ += 1; continue; } if (location_current != location_out) { log::trace("RegularIterator::Next()", "Get(): wrong location - 0x%08" PRIx64 " - 0x%08" PRIx64, location_current, location_out); index_location_ += 1; continue; } } log::trace("RegularIterator::Next()", "has a valid key/value pair"); key_ = key; value_ = value; index_location_ += 1; if (value_.size() > se_readonly_->db_options_.internal__size_multipart_required) { status_ = Status::MultipartRequired(); } //index_location_ += 1; return true; } return false; } ByteArray GetKey() { std::unique_lock lock(mutex_); return key_; } ByteArray GetValue() { std::unique_lock lock(mutex_); if (!value_.is_compressed()) return value_; if (value_.size() > 
se_readonly_->db_options_.internal__size_multipart_required) { return ByteArray(); } // TODO-36: Uncompression should not have to go through a MultipartReader. See // the notes about this TODO in kingdb.cc. char* buffer = new char[value_.size()]; uint64_t offset = 0; MultipartReader mp_reader(read_options_, value_); for (mp_reader.Begin(); mp_reader.IsValid(); mp_reader.Next()) { ByteArray part; mp_reader.GetPart(&part); log::trace("ByteArray::GetValue()", "Multipart loop size:%d [%s]", part.size(), part.ToString().c_str()); memcpy(buffer + offset, part.data(), part.size()); offset += part.size(); } status_ = mp_reader.GetStatus(); if (!status_.IsOK()) log::trace("ByteArray::GetValue()", "Error in GetValue(): %s\n", status_.ToString().c_str()); return NewShallowCopyByteArray(buffer, value_.size()); } MultipartReader GetMultipartValue() { return MultipartReader(read_options_, value_); } Status GetStatus() { std::unique_lock lock(mutex_); return status_; } private: bool is_closed_; StorageEngine *se_readonly_; ReadOptions read_options_; KingDB* snapshot_; std::mutex mutex_; uint32_t fileid_current_; std::string filepath_current_; uint32_t index_fileid_; std::vector* fileids_iterator_; uint32_t index_location_; std::vector locations_current_; bool has_file_; bool is_valid_; Status status_; ByteArray key_; ByteArray value_; }; class SequentialIterator: public IteratorResource { public: SequentialIterator() : is_closed_(true), se_readonly_(nullptr), snapshot_(nullptr), status_(Status::IOError("Invalid iterator")) { } SequentialIterator(ReadOptions& read_options, StorageEngine *se_readonly, std::vector* fileids_iterator) : is_closed_(false), se_readonly_(se_readonly), read_options_(read_options), snapshot_(nullptr), fileids_iterator_(fileids_iterator), status_(Status::OK()) { log::trace("SequentialIterator::ctor()", "start"); log::trace("SequentialIterator::ctor()", "fileids_iterator_->size():%u", fileids_iterator_->size()); } ~SequentialIterator() { Close(); } void 
Close() { std::unique_lock lock(mutex_); if (!is_closed_) { is_closed_ = true; if (snapshot_ != nullptr) { delete snapshot_; snapshot_ = nullptr; } } } SequentialIterator(SequentialIterator&& it) : mutex_() { log::trace("SequentialIterator::move-ctor()", "start"); this->se_readonly_ = it.se_readonly_; this->read_options_ = it.read_options_; this->snapshot_ = it.snapshot_; this->fileids_iterator_ = it.fileids_iterator_; this->is_closed_ = it.is_closed_; it.snapshot_ = nullptr; } void SetParentSnapshot(KingDB *snapshot) { snapshot_ = snapshot; } void Begin() { log::trace("SequentialIterator::Begin()", "start"); if (se_readonly_ == nullptr) { is_valid_ = false; return; } mutex_.lock(); fileid_current_ = 0; has_file_ = false; index_fileid_ = 0; is_valid_ = true; mutex_.unlock(); Next(); log::trace("SequentialIterator::Begin()", "end"); } bool IsValid() { log::trace("SequentialIterator::IsValid()", "start"); std::unique_lock lock(mutex_); log::trace("SequentialIterator::IsValid()", "end"); return is_valid_; } bool Next() { log::trace("SequentialIterator::Next()", "start"); std::unique_lock lock(mutex_); if (!is_valid_) return false; status_ = Status::OK(); Status s; while (true) { log::trace("SequentialIterator::Next()", "loop index_file:[%u] index_location:[%u]", index_fileid_, index_location_); if (index_fileid_ >= fileids_iterator_->size()) { log::trace("SequentialIterator::Next()", "invalid index_fileid_:[%u] fileids_iterator_->size():[%u]", index_fileid_, fileids_iterator_->size()); is_valid_ = false; break; } if (has_file_ && offset_ >= offset_end_) { //index_fileid_ += 1; //has_file_ = false; //continue; } if (!has_file_) { log::trace("SequentialIterator::Next()", "initialize file"); fileid_current_ = fileids_iterator_->at(index_fileid_); filepath_current_ = se_readonly_->GetFilepath(fileid_current_); offset_ = se_readonly_->db_options_.internal__hstable_header_size; struct stat info; if (stat(filepath_current_.c_str(), &info) != 0) { index_fileid_ += 1; 
continue; } mmap_.Open(filepath_current_, info.st_size); if (!mmap_.is_valid()) break; index_location_ = 0; has_file_ = true; struct HSTableFooter footer; s = HSTableFooter::DecodeFrom(mmap_.datafile() + mmap_.filesize() - HSTableFooter::GetFixedSize(), HSTableFooter::GetFixedSize(), &footer); offset_end_ = footer.offset_indexes; } struct EntryHeader entry_header; uint32_t size_header; Status s = EntryHeader::DecodeFrom(se_readonly_->db_options_, read_options_, mmap_.datafile() + offset_, mmap_.filesize() - offset_, &entry_header, &size_header); if ( !s.IsOK() || !entry_header.AreSizesValid(offset_, mmap_.filesize())) { // End of file during recovery, thus breaking out of the while-loop mmap_.Close(); has_file_ = false; index_fileid_ += 1; continue; } //ByteArray key = NewMmappedByteArray(filepath, mmap_.filesize()); ByteArray key = ByteArray::NewPooledByteArray(se_readonly_->file_manager_, fileid_current_, filepath_current_, mmap_.filesize_); ByteArray value = key; if (read_options_.verify_checksums) { uint32_t checksum_key = crc32c::Value(value.data() + offset_ + size_header, entry_header.size_key); value.set_checksum_initial(checksum_key); } key.set_offset(offset_ + size_header); key.set_size(entry_header.size_key); value.set_offset(offset_ + size_header + entry_header.size_key); value.set_size(entry_header.size_value); value.set_size_compressed(entry_header.size_value_compressed); value.set_checksum(entry_header.checksum_content); offset_ += size_header + entry_header.size_key + entry_header.size_value_offset(); log::trace("SequentialIterator::Next()", "has a valid key/value pair"); key_ = key; value_ = value; index_location_ += 1; if (value_.size() > se_readonly_->db_options_.internal__size_multipart_required) { status_ = Status::MultipartRequired(); } return true; } return false; } ByteArray GetKey() { std::unique_lock lock(mutex_); return key_; } ByteArray GetValue() { std::unique_lock lock(mutex_); if (!value_.is_compressed()) return value_; if 
(value_.size() > se_readonly_->db_options_.internal__size_multipart_required) { return ByteArray(); } // TODO-36: Uncompression should not have to go through a MultipartReader. See // the notes about this TODO in kingdb.cc. char* buffer = new char[value_.size()]; uint64_t offset = 0; MultipartReader mp_reader(read_options_, value_); for (mp_reader.Begin(); mp_reader.IsValid(); mp_reader.Next()) { ByteArray part; mp_reader.GetPart(&part); log::trace("ByteArray::GetValue()", "Multipart loop size:%d [%s]", part.size(), part.ToString().c_str()); memcpy(buffer + offset, part.data(), part.size()); offset += part.size(); } status_ = mp_reader.GetStatus(); if (!status_.IsOK()) log::trace("ByteArray::GetValue()", "Error in GetValue(): %s\n", status_.ToString().c_str()); return NewShallowCopyByteArray(buffer, value_.size()); } MultipartReader GetMultipartValue() { return MultipartReader(read_options_, value_); } Status GetStatus() { std::unique_lock lock(mutex_); return status_; } private: bool is_closed_; StorageEngine *se_readonly_; ReadOptions read_options_; KingDB* snapshot_; std::mutex mutex_; uint32_t fileid_current_; std::string filepath_current_; uint32_t offset_; uint32_t offset_end_; uint32_t index_fileid_; std::vector* fileids_iterator_; uint32_t index_location_; std::vector locations_current_; bool has_file_; bool is_valid_; Status status_; Mmap mmap_; ByteArray key_; ByteArray value_; }; class Iterator { friend class Snapshot; public: Iterator(): resource_(nullptr) {} ~Iterator() { if (resource_ != nullptr) { delete resource_; resource_ = nullptr; } } Iterator(Iterator&& ir) { this->resource_ = ir.resource_; ir.resource_ = nullptr; } void Close() { resource_->Close(); } void SetParentSnapshot(KingDB *snapshot) { resource_->SetParentSnapshot(snapshot); } void Begin() { resource_->Begin(); } bool IsValid() { return resource_->IsValid(); } bool Next() { return resource_->Next(); } ByteArray GetKey() { return resource_->GetKey(); } ByteArray GetValue() { return 
resource_->GetValue(); } MultipartReader GetMultipartValue() { return resource_->GetMultipartValue(); } Status GetStatus() { return resource_->GetStatus(); } bool _DEBUGGING_IsSequential() { // Warning: for unit tests only, do not use this method. if (dynamic_cast(resource_)) { return true; } else { return false; } } private: IteratorResource* resource_; void SetIteratorResource(IteratorResource* resource) { resource_ = resource; } }; } // end namespace kdb #endif // KINGDB_ITERATOR_MAIN_H_ ================================================ FILE: interface/kingdb.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. #ifndef KINGDB_INTERFACE_H_ #define KINGDB_INTERFACE_H_ #include "util/options.h" #include "util/status.h" #include "util/order.h" #include "util/byte_array.h" namespace kdb { class MultipartReader; class Iterator; // KingDB is an abstract class that serves as an interface for the two main classes // that allow access to a KingDB database: Database and Snapshot. 
class KingDB { friend class MultipartWriter; public: virtual ~KingDB() {} virtual Status Get(ReadOptions& read_options, ByteArray& key, ByteArray* value_out) = 0; virtual Status Get(ReadOptions& read_options, ByteArray& key, std::string* value_out) { ByteArray value; Status s = Get(read_options, key, &value); if (!s.IsOK()) return s; *value_out = value.ToString(); return s; } virtual Status Get(ReadOptions& read_options, const std::string& key, ByteArray* value_out) { ByteArray byte_array_key = NewPointerByteArray(key.c_str(), key.size()); Status s = Get(read_options, byte_array_key, value_out); return s; } virtual Status Get(ReadOptions& read_options, const std::string& key, std::string* value_out) { ByteArray byte_array_key = NewPointerByteArray(key.c_str(), key.size()); ByteArray value; Status s = Get(read_options, key, &value); if (!s.IsOK()) return s; *value_out = value.ToString(); return s; } virtual Status Put(WriteOptions& write_options, ByteArray& key, ByteArray& chunk) = 0; virtual Status Put(WriteOptions& write_options, ByteArray& key, const std::string& chunk) { ByteArray byte_array_chunk = NewDeepCopyByteArray(chunk.c_str(), chunk.size()); return Put(write_options, key, byte_array_chunk); } virtual Status Put(WriteOptions& write_options, const std::string& key, ByteArray& chunk) { ByteArray byte_array_key = NewDeepCopyByteArray(key.c_str(), key.size()); return Put(write_options, byte_array_key, chunk); } virtual Status Put(WriteOptions& write_options, const std::string& key, const std::string& chunk) { ByteArray byte_array_key = NewDeepCopyByteArray(key.c_str(), key.size()); ByteArray byte_array_chunk = NewDeepCopyByteArray(chunk.c_str(), chunk.size()); return Put(write_options, byte_array_key, byte_array_chunk); } virtual MultipartReader NewMultipartReader(ReadOptions& read_options, ByteArray& key) = 0; virtual Status Delete(WriteOptions& write_options, ByteArray& key) = 0; virtual Iterator NewIterator(ReadOptions& read_options) = 0; virtual Status 
Open() = 0; virtual void Close() = 0; virtual void Flush() = 0; virtual void Compact() = 0; private: virtual Status PutPart(WriteOptions& write_options, ByteArray& key, ByteArray& chunk, uint64_t offset_chunk, uint64_t size_value) = 0; }; } // namespace kdb #endif // KINGDB_INTERFACE_H_ ================================================ FILE: interface/multipart.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. #ifndef KINGDB_MULTIPART_H_ #define KINGDB_MULTIPART_H_ #include "util/debug.h" #include #include #include #include #include #include #include #include #include #include "util/logger.h" #include "util/options.h" #include "util/byte_array.h" #include "algorithm/compressor.h" #include "algorithm/crc32c.h" #include "interface/kingdb.h" namespace kdb { class MultipartReader { friend class KingDB; friend class Snapshot; friend class Database; friend class RegularIterator; friend class SequentialIterator; public: ~MultipartReader() {} virtual void Begin() { log::trace("MultipartReader::Next()", "Begin()"); if (read_options_.verify_checksums) { crc32_.ResetThreadLocalStorage(); crc32_.put(value_.checksum_initial()); } is_valid_stream_ = true; is_compression_disabled_ = false; offset_output_ = 0; compressor_.ResetThreadLocalStorage(); status_ = Status::IOError("Stream is unfinished"); Next(); } virtual bool IsValid() { return is_valid_stream_; } virtual Status GetStatus() { log::trace("MultipartReader::GetStatus()", ""); return status_; } // Careful here: if the call to Next() is the first one, i.e. the one in // Begin(), then is_valid_stream_ must not be set to false yet, otherwise // the for-loops of type for(Begin(); IsValid(); Next()) would never run, // as IsValid() would prevent the first iteration to start. 
  // Advances the stream by one part. Three regimes, selected by the stored
  // value's compression state:
  //   1) compressed: uncompress one frame per call, streaming the frame bytes
  //      into the CRC32 when verify_checksums is set;
  //   2) compression disabled mid-stream (detected from the frame header):
  //      fall through to raw copy of the remaining bytes;
  //   3) not compressed: hand out raw 1 MB slices of the value.
  // Always returns true; end-of-stream is signaled by setting
  // is_valid_stream_ to false (see the comment above about Begin()).
  virtual bool Next() {
    if (is_compressed() && !is_compression_disabled_) {
      if (compressor_.IsUncompressionDone(value_.size_compressed())) {
        // All compressed input consumed: finalize and check the checksum.
        is_valid_stream_ = false;
        if ( !read_options_.verify_checksums
            || crc32_.get() == value_.checksum()) {
          log::debug("MultipartReader::Next()", "Good CRC32 - stored:0x%08" PRIx64 " computed:0x%08" PRIx64 "\n", value_.checksum(), crc32_.get());
          status_ = Status::OK();
        } else {
          log::debug("MultipartReader::Next()", "Bad CRC32 - stored:0x%08" PRIx64 " computed:0x%08" PRIx64 "\n", value_.checksum(), crc32_.get());
          status_ = Status::IOError("Invalid checksum.");
        }
        return true;
      }
      if (compressor_.HasFrameHeaderDisabledCompression(value_.data() + offset_output_)) {
        // The writer turned compression off at this frame: skip the header
        // (still feeding it to the CRC) and switch to the raw-copy path below.
        log::debug("MultipartReader::Next()", "Finds that compression is disabled\n");
        is_compression_disabled_ = true;
        if (read_options_.verify_checksums) {
          crc32_.stream(value_.data() + offset_output_, compressor_.size_frame_header());
        }
        offset_output_ += compressor_.size_frame_header();
      }
      if (!is_compression_disabled_) {
        char *frame;
        uint64_t size_frame;
        char *data_out;
        uint64_t size_out;
        log::trace("MultipartReader::Next()", "before uncompress");
        // Uncompress the next frame; the compressor keeps its own progress in
        // thread-local storage (reset in Begin()).
        Status s = compressor_.Uncompress(value_.data(), value_.size_compressed(), &data_out, &size_out, &frame, &size_frame);
        offset_output_ += size_frame;
        // chunk_ takes ownership of the buffer allocated by Uncompress().
        chunk_ = NewShallowCopyByteArray(data_out, size_out);
        if (s.IsDone()) {
          is_valid_stream_ = false;
          status_ = Status::OK();
        } else if (s.IsOK()) {
          if (read_options_.verify_checksums) {
            crc32_.stream(frame, size_frame);
          }
        } else {
          is_valid_stream_ = false;
          status_ = s;
        }
      }
    }
    if (!value_.is_compressed() || is_compression_disabled_) {
      log::trace("MultipartReader::Next()", "No compression or compression disabled");
      // When compression was disabled mid-stream, the remaining bytes to copy
      // are counted against size_compressed(), not size().
      uint64_t size_left;
      if (value_.is_compressed() && is_compression_disabled_) {
        size_left = value_.size_compressed();
      } else {
        size_left = value_.size();
      }
      if (offset_output_ == size_left) {
        log::trace("MultipartReader::Next()", "Has gotten all the data");
        is_valid_stream_ = false;
        status_ = Status::OK();
        return true;
      }
      char* data_left = value_.data() + offset_output_;
      size_t step = 1024*1024; // TODO: make this a parameter at some point?
      size_t size_current = offset_output_ + step < size_left ? step : size_left - offset_output_;
      if (read_options_.verify_checksums) {
        crc32_.stream(data_left, size_current);
      }
      // Hand out a window into value_ (no copy): shift the offset and shrink
      // the size to the current slice.
      chunk_ = value_;
      chunk_.increment_offset(offset_output_);
      chunk_.set_size(size_current);
      chunk_.set_size_compressed(0);
      offset_output_ += size_current;
      status_ = Status::OK();
      log::trace("MultipartReader::Next()", "Done with handling uncompressed data - Status:%s", status_.ToString().c_str());
    }
    return true;
  }

  // Copies the current part into *part and returns the stream status.
  virtual Status GetPart(ByteArray* part) {
    log::trace("MultipartReader::Next()", "GetPart() - Status:%s", status_.ToString().c_str());
    *part = chunk_;
    return status_;
  }

  CompressorLZ4 compressor_;
  CRC32 crc32_;
  uint64_t offset_output_;           // bytes of input consumed so far
  bool is_compression_disabled_;     // set when a frame header disables compression
  Status status_;
  ByteArray chunk_;                  // part returned by GetPart()
  bool is_valid_stream_;

  bool is_compressed() { return value_.is_compressed(); }

  // Copy constructor intentionally copies only the inputs (read_options_,
  // value_); the per-stream state is re-initialized by Begin().
  MultipartReader(const MultipartReader& r) {
    if(&r != this) {
      this->read_options_ = r.read_options_;
      this->value_ = r.value_;
    }
  }

  uint64_t size() { return value_.size(); }

 private:
  // Error-carrying reader: IsValid() stays false-able through status_.
  MultipartReader(Status s)
      : status_(s) {
  }

  MultipartReader(ReadOptions& read_options, ByteArray& value)
      : status_(Status::OK()),
        read_options_(read_options),
        value_(value) {
  }

  ReadOptions read_options_;
  ByteArray value_;
};


// Streams the parts of a large value into the database through
// KingDB::PutPart(); only Database (friend) can construct one.
class MultipartWriter {
  friend class Database;
 public:
  ~MultipartWriter() {}

  // Writes 'part' at the current offset and advances the offset on success.
  Status PutPart(ByteArray& part) {
    Status s = db_->PutPart(write_options_, key_, part, offset_, size_value_total_);
    if (s.IsOK()) offset_ += part.size();
    return s;
  }

 private:
  MultipartWriter(KingDB* db, WriteOptions& write_options, ByteArray& key, uint64_t size_value_total)
      : db_(db),
        write_options_(write_options),
        key_(key),
        size_value_total_(size_value_total),
        offset_(0) {
  }

  KingDB* db_;
  WriteOptions write_options_;
  ByteArray key_;
  uint64_t size_value_total_;   // total size the full value will have
  uint64_t offset_;             // bytes written so far
};

} // namespace kdb
#endif // KINGDB_MULTIPART_H_


================================================
FILE: interface/snapshot.h
================================================
// Copyright (c) 2014, Emmanuel Goossaert. All rights reserved.
// Use of this source code is governed by the BSD 3-Clause License,
// that can be found in the LICENSE file.

#ifndef KINGDB_SNAPSHOT_MAIN_H_
#define KINGDB_SNAPSHOT_MAIN_H_

#include

#include "util/status.h"
#include "interface/iterator.h"
#include "interface/kingdb.h"
#include "interface/multipart.h"
#include "util/order.h"
#include "util/byte_array.h"
#include "util/options.h"

namespace kdb {

// Read-only view of the database at a point in time: all reads go through a
// dedicated read-only storage engine (se_readonly_), which this class owns
// and deletes on Close(); se_live_ is only used to release the snapshot id.
// All mutating operations return "Not supported".
class Snapshot: public KingDB {
 public:
  Snapshot()
      : se_live_(nullptr),
        se_readonly_(nullptr),
        fileids_iterator_(nullptr),
        is_closed_(true) {
  }

  Snapshot(const DatabaseOptions& db_options,
           const std::string dbname,
           StorageEngine *se_live,
           StorageEngine *se_readonly,
           std::vector* fileids_iterator,
           uint32_t snapshot_id)
      : db_options_(db_options),
        dbname_(dbname),
        se_live_(se_live),
        se_readonly_(se_readonly),
        snapshot_id_(snapshot_id),
        fileids_iterator_(fileids_iterator),
        is_closed_(false) {
  }

  // Move constructor: transfers ownership of the read-only engine and the
  // fileids vector so that only the destination releases them.
  Snapshot(Snapshot&& s)
      : mutex_close_() {
    this->db_options_ = s.db_options_;
    this->dbname_ = s.dbname_;
    this->se_live_ = s.se_live_;
    this->se_readonly_ = s.se_readonly_;
    this->snapshot_id_ = s.snapshot_id_;
    this->fileids_iterator_ = s.fileids_iterator_;
    this->is_closed_ = s.is_closed_;
    s.fileids_iterator_ = nullptr;
    s.se_readonly_ = nullptr;
  }

  virtual ~Snapshot() {
    log::trace("Snapshot::dtor()", "");
    Close();
  }

  virtual Status Open() override {
    return Status::OK();
  }

  // Idempotent teardown: releases the snapshot id on the live engine and
  // destroys the owned read-only engine. Safe on a default-constructed
  // instance because is_closed_ starts out true.
  virtual void Close() override {
    log::trace("Snapshot::Close()", "start");
    std::unique_lock lock(mutex_close_);
    if (is_closed_) return;
    is_closed_ = true;
    delete fileids_iterator_;
    se_live_->ReleaseSnapshot(snapshot_id_);
    delete se_readonly_;
    log::trace("Snapshot::Close()", "end");
  }

  virtual Status Get(ReadOptions& read_options, ByteArray& key, ByteArray* value_out) override {
    Status s = se_readonly_->Get(read_options, key, value_out);
    // The branches below only differ by their trace message; all return s.
    if (s.IsNotFound()) {
      log::trace("Snapshot::Get()", "not found in storage engine");
      return s;
    } else if (s.IsOK()) {
      log::trace("Snapshot::Get()", "found in storage engine");
      return s;
    } else {
      log::trace("Snapshot::Get()", "unidentified error");
      return s;
    }
    return s;
  }

  // Snapshots are read-only: every write entry point is rejected.
  virtual Status Put(WriteOptions& write_options, ByteArray& key, ByteArray& chunk) override {
    return Status::IOError("Not supported");
  }

  virtual Status PutPart(WriteOptions& write_options, ByteArray& key, ByteArray& chunk, uint64_t offset_chunk, uint64_t size_value) override {
    return Status::IOError("Not supported");
  }

  virtual Status Delete(WriteOptions& write_options, ByteArray& key) override {
    return Status::IOError("Not supported");
  }

  // Picks the iterator implementation: RegularIterator when uncompacted data
  // exists (index lookups required), SequentialIterator otherwise (straight
  // scan of the HSTable files).
  virtual Iterator NewIterator(ReadOptions& read_options) override {
    IteratorResource* ir = nullptr;
    uint64_t dbsize_uncompacted = se_readonly_->GetDbSizeUncompacted();
    if (dbsize_uncompacted > 0) {
      ir = new RegularIterator(read_options, se_readonly_, fileids_iterator_);
    } else {
      ir = new SequentialIterator(read_options, se_readonly_, fileids_iterator_);
    }
    Iterator it;
    it.SetIteratorResource(ir);
    return it;
  }

  virtual MultipartReader NewMultipartReader(ReadOptions& read_options, ByteArray& key) {
    ByteArray value;
    // want_raw_data=true: the reader uncompresses the parts itself.
    Status s = GetRaw(read_options, key, &value, true);
    if (!s.IsOK()) {
      return MultipartReader(s);
    } else {
      return MultipartReader(read_options, value);
    }
  }

  virtual void Flush() {}
  virtual void Compact() {}

 private:
  // Fetches the stored bytes for 'key'. When want_raw_data is false and the
  // value is compressed, it is fully uncompressed into a new buffer (unless
  // it exceeds the multipart threshold, in which case MultipartRequired is
  // returned and the caller must use NewMultipartReader()).
  Status GetRaw(ReadOptions& read_options, ByteArray& key, ByteArray* value_out, bool want_raw_data) {
    // WARNING: code duplication with Database::GetRaw()
    if (is_closed_) return Status::IOError("The database is not open");
    log::trace("Database GetRaw()", "[%s]", key.ToString().c_str());
    Status s = se_readonly_->Get(read_options, key, value_out);
    if (s.IsNotFound()) {
      log::trace("Database GetRaw()", "not found in storage engine");
      return s;
    } else if (s.IsOK()) {
      log::trace("Database GetRaw()", "found in storage engine");
    } else {
      log::trace("Database GetRaw()", "unidentified error");
      return s;
    }
    // TODO-36: There is technical debt here:
    // 1. The uncompression should be able to proceed without having to call a
    //    Multipart Reader.
    // 2. The uncompression should be able to operate within a single buffer, and
    //    not have to copy data into intermediate buffers through the Multipart
    //    Reader as it is done here. Having intermediate buffers means that there
    //    is more data copy than necessary, thus more time wasted
    log::trace("Database GetRaw()", "Before Multipart - want_raw_data:%d value_out->is_compressed():%d", want_raw_data, value_out->is_compressed());
    if (want_raw_data == false && value_out->is_compressed()) {
      if (value_out->size() > db_options_.internal__size_multipart_required) {
        return Status::MultipartRequired();
      }
      char* buffer = new char[value_out->size()];
      uint64_t offset = 0;
      MultipartReader mp_reader(read_options, *value_out);
      for (mp_reader.Begin(); mp_reader.IsValid(); mp_reader.Next()) {
        ByteArray part;
        mp_reader.GetPart(&part);
        log::trace("Database GetRaw()", "Multipart loop size:%d [%s]", part.size(), part.ToString().c_str());
        memcpy(buffer + offset, part.data(), part.size());
        offset += part.size();
      }
      // Ownership of 'buffer' passes to the returned ByteArray.
      *value_out = NewShallowCopyByteArray(buffer, value_out->size());
    }
    return s;
  }

  kdb::DatabaseOptions db_options_;
  std::string dbname_;
  kdb::StorageEngine* se_live_;      // not owned; used only to release the snapshot id
  kdb::StorageEngine* se_readonly_;  // owned; deleted in Close()
  uint32_t snapshot_id_;
  std::vector* fileids_iterator_;    // owned; deleted in Close()
  bool is_closed_;
  std::mutex mutex_close_;
};

} // end namespace kdb

#endif // KINGDB_SNAPSHOT_MAIN_H_


================================================
FILE: network/client.h
================================================
// Copyright (c) 2014, Emmanuel Goossaert. All rights reserved.
// Use of this source code is governed by the BSD 3-Clause License,
// that can be found in the LICENSE file.
#ifndef KINGDB_CLIENT_H_ #define KINGDB_CLIENT_H_ #include "util/debug.h" #include #include #include #include #include #include #include #include #include #include "algorithm/murmurhash3.h" #include "util/status.h" #include "util/logger.h" #include "thread/threadpool.h" #define SIZE_BUFFER_CLIENT 1024*1024*1 // used by client to get data from server #define SIZE_LARGE_TEST_ITEMS 1024*1024*1 // size of large items used for testing #define MAX_RETRIES 2 #define RANDOM_DIST_LOWER_BOUND 256*1024 #define RANDOM_DIST_UPPER_BOUND 512*1024 //#define RANDOM_DIST_LOWER_BOUND 10*1024 //#define RANDOM_DIST_UPPER_BOUND 12*1024 //#define RANDOM_DIST_LOWER_BOUND 1*1024 //#define RANDOM_DIST_UPPER_BOUND (1*1024 + 200) namespace kdb { class Client { public: Client(std::string database) : memc(nullptr) { memc = memcached(database.c_str(), database.length()); //uint64_t value; //value = memcached_behavior_get(memc, MEMCACHED_BEHAVIOR_CONNECT_TIMEOUT); //printf("MEMCACHED_BEHAVIOR_CONNECT_TIMEOUT: %" PRIu64 "\n", value); memcached_behavior_set(memc, MEMCACHED_BEHAVIOR_CONNECT_TIMEOUT, 30000); memcached_behavior_set(memc, MEMCACHED_BEHAVIOR_POLL_TIMEOUT, 30000); memcached_behavior_set(memc, MEMCACHED_BEHAVIOR_RETRY_TIMEOUT, 100); } ~Client() { if (memc != nullptr) { memcached_free(memc); } } bool IsValid() { return (memc != nullptr); } uint64_t hash_function(const std::string& key) { static char hash[16]; static uint64_t output; MurmurHash3_x64_128(key.c_str(), key.size(), 0, hash); memcpy(&output, hash, 8); return output; } Status Get(const std::string& key, char **value_out, int *size_value) { char* buffer = new char[SIZE_BUFFER_CLIENT]; memcached_return_t rc; const char* keys[1]; keys[0] = key.c_str(); size_t key_length[]= {key.length()}; uint32_t flags; char return_key[MEMCACHED_MAX_KEY]; size_t return_key_length; char *return_value; size_t return_value_length; rc = memcached_mget(memc, keys, key_length, 1); if (rc != MEMCACHED_SUCCESS) { std::string msg = key + " " + 
memcached_strerror(memc, rc); return Status::IOError(msg); } while ((return_value = memcached_fetch(memc, return_key, &return_key_length, &return_value_length, &flags, &rc))) { memcpy(buffer, return_value, return_value_length); buffer[return_value_length] = '\0'; *value_out = buffer; *size_value = return_value_length; free(return_value); } if (rc == MEMCACHED_NOTFOUND) { return Status::NotFound("key: " + key); } else if (rc != MEMCACHED_END) { return Status::IOError(key + " " + memcached_strerror(memc, rc)); } return Status::OK(); } Status Put(const std::string& key, const std::string& value) { memcached_return_t rc = memcached_set(memc, key.c_str(), key.length(), value.c_str(), value.length(), (time_t)0, (uint32_t)0); if (rc != MEMCACHED_SUCCESS) { std::string msg = key + " " + memcached_strerror(memc, rc); return Status::IOError(msg); } return Status::OK(); } Status Put(const char* key, uint64_t size_key, const char *value, uint64_t size_value) { memcached_return_t rc = memcached_set(memc, key, size_key, value, size_value, (time_t)0, (uint32_t)0); if (rc != MEMCACHED_SUCCESS) { std::string msg = std::string(key) + " " + memcached_strerror(memc, rc); return Status::IOError(msg); } return Status::OK(); } Status Delete(const char* key, uint64_t size_key) { memcached_return_t rc = memcached_delete(memc, key, size_key, (time_t)0); if (rc != MEMCACHED_SUCCESS) { std::string msg = std::string(key) + " " + memcached_strerror(memc, rc); return Status::IOError(msg); } return Status::OK(); } private: memcached_st *memc; }; class ClientTask: public Task { public: ClientTask(std::string database, int num_writes, int num_removes, int num_reads) : database_(database), num_writes_(num_writes), num_removes_(num_removes), num_reads_(num_reads) { } virtual ~ClientTask() {} virtual void RunInLock(std::thread::id tid) { //std::cout << "Thread " << tid << std::endl; } virtual void Run(std::thread::id tid, uint64_t id) { Client client(database_); if (!client.IsValid()) { 
log::emerg("ClientTask", "Could not load the client"); return; } int size = SIZE_LARGE_TEST_ITEMS; char *buffer_large = new char[size+1]; for (auto i = 0; i < size; i++) { buffer_large[i] = 'a'; } buffer_large[size] = '\0'; std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); Status s; std::seed_seq seq{1, 2, 3, 4, 5, 6, 7}; std::mt19937 generator(seq); std::uniform_int_distribution random_dist(RANDOM_DIST_LOWER_BOUND, RANDOM_DIST_UPPER_BOUND); for (auto i = 0; i < num_writes_; i++) { std::stringstream ss; ss << id << "-" << i; std::string key = ss.str(); int size_value = random_dist(generator); char *value = MakeValue(key, size_value); for (auto retry = 0; retry < MAX_RETRIES; retry++) { s = client.Put(ss.str().c_str(), ss.str().size(), value, size_value); if (s.IsOK()) { retry = MAX_RETRIES; } else { log::alert("ClientTask", "Put() Error for key [%s]: %s", key.c_str(), s.ToString().c_str()); //exit(-1); } if (retry >= MAX_RETRIES - 1) break; std::this_thread::sleep_for(std::chrono::milliseconds(1000)); log::debug("ClientTask", "retry key: [%s]", key.c_str()); } log::info("ClientTask", "Put(%s, size:%" PRIu64 ") - [%s]", ss.str().c_str(), size_value, s.ToString().c_str()); delete[] value; } std::mt19937 generator_remove(seq); std::uniform_int_distribution random_dist_remove(RANDOM_DIST_LOWER_BOUND, RANDOM_DIST_UPPER_BOUND); for (auto i = 0; i < num_removes_; i++) { std::stringstream ss; ss << id << "-" << i; std::string key = ss.str(); s = client.Delete(key.c_str(), key.size()); if (!s.IsOK()) { log::info("ClientTask", "Delete() Error for key [%s]: %s", key.c_str(), s.ToString().c_str()); } else { log::alert("ClientTask", "Delete() insert(key) %d %d", i, num_removes_); keys_removed.insert(key); } } std::mt19937 generator2(seq); std::uniform_int_distribution random_dist2(RANDOM_DIST_LOWER_BOUND, RANDOM_DIST_UPPER_BOUND); for (auto i = 0; i < num_reads_; i++) { std::stringstream ss; ss << id << "-" << i; std::string key = 
ss.str(); int size_value = random_dist2(generator2); auto it_find = keys_removed.find(key); bool has_item = false; if (it_find == keys_removed.end()) has_item = true; char *value = nullptr; int size_value_get; for (auto retry = 0; retry < MAX_RETRIES; retry++) { s = client.Get(key, &value, &size_value_get); if (!has_item) { if (s.IsNotFound()) { log::info("ClientTask", "Get() OK for removed key [%s]: %s", key.c_str(), s.ToString().c_str()); retry = MAX_RETRIES; } else { log::info("ClientTask", "Get() Error for removed key [%s]: %s", key.c_str(), s.ToString().c_str()); } } else if (!s.IsOK()) { log::info("ClientTask", "Get() Error for key [%s]: %s", key.c_str(), s.ToString().c_str()); } else { if (size_value != size_value_get) { log::info("ClientTask", "Found error in sizes for %s: [%d] [%d]", key.c_str(), size_value, size_value_get); } else { log::info("ClientTask", "Size OK for %s: [%d] [%d]", key.c_str(), size_value, size_value_get); int ret = VerifyValue(key, size_value, value); if (ret < 0) { log::info("ClientTask", "Found error in content for key [%s]", key.c_str()); } else { log::info("ClientTask", "Verified content of key [%s]", key.c_str()); retry = MAX_RETRIES; } } } if (retry >= MAX_RETRIES - 1) break; std::this_thread::sleep_for(std::chrono::milliseconds(5000)); log::info("ClientTask", "retry key: [%s]", key.c_str()); } delete[] value; } std::stringstream ss; std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now(); uint64_t duration = std::chrono::duration_cast(end - start).count(); ss << "Done in " << duration << " ms"; log::info("ClientTask", "%s", ss.str().c_str()); delete[] buffer_large; } // use the hashes from MakeValue2() instead of the hashes from MakeValue() char* MakeValue(const std::string& key, int size_value) { int size_key = key.size(); char *str = new char[size_value+1]; str[size_value] = '\0'; int i = 0; for (i = 0; i < size_value / size_key; i++) { memcpy(str + i*size_key, key.c_str(), size_key); } 
if (size_value % size_key != 0) { memcpy(str + i*size_key, key.c_str(), size_value % size_key); } return str; } int VerifyValue(const std::string& key, int size_value, const char* value) { int size_key = key.size(); int i = 0; bool error = false; for (i = 0; i < size_value / size_key; i++) { if (memcmp(value + i*size_key, key.c_str(), size_key)) { std::string value2(value + i*size_key, size_key); printf("diff i:%d size:%d key:[%s], value:[%s]\n", i, size_key, key.c_str(), value2.c_str()); error = true; } } if (size_value % size_key != 0) { if (memcmp(value + i*size_key, key.c_str(), size_value % size_key)) { std::string value2(value, size_value % size_key); printf("diff remainder size:%d key:[%s], value:[%s]\n", size_value % size_key, key.c_str(), value2.c_str()); error = true; } } if (error) return -1; return 0; } char* MakeValue2(const std::string& key, int size_value) { static char hash[16]; MurmurHash3_x64_128(key.c_str(), key.size(), 0, hash); char *str = new char[size_value+1]; str[size_value] = '\0'; int i = 0; for (i = 0; i < size_value / 16; i++) { memcpy(str + i*16, hash, 16); } if (size_value % 16 != 0) { memcpy(str + i*16, hash, size_value % 16); } return str; } int VerifyValue2(const std::string& key, int size_value, const char* value) { static char hash[16]; MurmurHash3_x64_128(key.c_str(), key.size(), 0, hash); int i = 0; for (i = 0; i < size_value / 16; i++) { if (memcmp(value + i*16, hash, 16)) { std::string hash2(hash, 16); std::string value2(value, size_value); printf("diff key:[%s], hash:[%s] value:[%s]\n", key.c_str(), hash2.c_str(), value2.c_str()); return -1; } } if (size_value % 16 != 0) { if (memcmp(value + i*16, hash, size_value % 16)) { return -1; } } return 0; } std::string database_; int num_writes_; int num_reads_; int num_removes_; std::set keys_removed; }; }; #endif // KINGDB_CLIENT_H_ ================================================ FILE: network/client_main.cc ================================================ #include 
"network/client.h" #include void show_usage(char *program_name) { printf("Example: %s --host 127.0.0.1:11211 --num-threads 120 --write 10000 --remove 5000 --read 10000\n", program_name); } int main(int argc, char **argv) { if (argc == 1) { show_usage(argv[0]); exit(0); } if (argc % 2 == 0) { std::cerr << "Error: invalid number of arguments" << std::endl; show_usage(argv[0]); exit(-1); } std::string host(""); int num_threads = 0; int num_writes = 0; int num_removes = 0; int num_reads = 0; if (argc > 2) { for (int i = 1; i < argc; i += 2 ) { if (strcmp(argv[i], "--host" ) == 0) { host = "--SERVER=" + std::string(argv[i+1]); } else if (strcmp(argv[i], "--num-threads" ) == 0) { num_threads = atoi(argv[i+1]); } else if (strcmp(argv[i], "--write" ) == 0) { num_writes = atoi(argv[i+1]); } else if (strcmp(argv[i], "--remove" ) == 0) { num_removes = atoi(argv[i+1]); } else if (strcmp(argv[i], "--read" ) == 0) { num_reads = atoi(argv[i+1]); } else if (strcmp(argv[i], "--loglevel" ) == 0) { if (kdb::Logger::set_current_level(argv[i+1]) < 0 ) { fprintf(stderr, "Unknown log level: [%s]\n", argv[i+1]); exit(-1); } } else { fprintf(stderr, "Unknown parameter [%s]\n", argv[i]); exit(-1); } } } if (host == "" || num_threads == 0) { fprintf(stderr, "Missing arguments\n"); exit(-1); } kdb::ThreadPool tp(num_threads); tp.Start(); for (auto i = 0; i < num_threads; i++ ) { tp.AddTask(new kdb::ClientTask(host, num_writes, num_removes, num_reads)); if (i && i % 50 == 0) usleep(2 * 1000 * 1000); } tp.BlockUntilAllTasksHaveCompleted(); return 0; } ================================================ FILE: network/server.cc ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. // IMPORTANT: The KingServer code is just a hack that I have put together // to test KingDB over the network. 
I am aware that some of the // code is ill-formed and underperforming: I have deliberately // chosen to cut corners to get to a running solution fast, // and I will improve this code when and if needed. #include "network/server.h" namespace kdb { void NetworkTask::Run(std::thread::id tid, uint64_t id) { int bytes_received_last; std::regex regex_get {"get ([^\\s]*)"}; std::regex regex_put {"set ([^\\s]*) \\d* \\d* (\\d*)\r\n"}; std::regex regex_delete {"delete ([^\\s]*)"}; uint32_t bytes_received_buffer = 0; uint32_t bytes_received_total = 0; uint32_t bytes_expected = 0; uint64_t size_value = 0; uint64_t offset_value = 0; bool is_new = true; bool is_new_buffer = true; bool is_command_get = false; bool is_command_put = false; bool is_command_delete = false; char *buffer_send = new char[server_options_.internal__size_buffer_send]; ByteArray buffer; ByteArray key; log::trace("NetworkTask", "ENTER"); // TODO-7: replace the memory allocation performed for 'key' and 'buffer' by a // pool of pre-allocated buffers ReadOptions read_options; WriteOptions write_options; while (!IsStopRequested()) { // Receive the data log::trace("NetworkTask", "looping..."); if (is_new) { log::trace("NetworkTask", "is_new"); bytes_received_total = 0; bytes_expected = 0; size_value = 0; offset_value = 0; is_command_get = false; is_command_put = false; is_command_delete = false; } if (is_new_buffer) { log::trace("NetworkTask", "is_new_buffer"); bytes_received_buffer = 0; buffer = ByteArray::NewAllocatedMemoryByteArray(server_options_.recv_socket_buffer_size); log::trace("NetworkTask", "allocated"); } log::trace("NetworkTask", "Calling recv()"); bytes_received_last = recv(sockfd_, buffer.data() + bytes_received_buffer, server_options_.recv_socket_buffer_size - bytes_received_buffer, 0); if (bytes_received_last <= 0) { log::trace("NetworkTask", "recv()'d 0 bytes: breaking"); break; } bytes_received_buffer += bytes_received_last; bytes_received_total += bytes_received_last; //buffer.SetOffset(0, 
bytes_received_buffer); buffer.set_offset(0); buffer.set_size(bytes_received_buffer); log::trace("NetworkTask", "recv()'d %d bytes of data in buf - bytes_expected:%d bytes_received_buffer:%d bytes_received_total:%d", bytes_received_last, bytes_expected, bytes_received_buffer, bytes_received_total); // TODO: simplify the nested if-else blocks below to reduce // indentation levels if (is_new) { // Determine command type if (buffer.size() >= 3 && memcmp(buffer.data(), "get", 3) == 0) { is_command_get = true; } else if (buffer.size() >= 3 && memcmp(buffer.data(), "set", 3) == 0) { is_command_put = true; } else if (buffer.size() >= 6 && memcmp(buffer.data(), "delete", 6) == 0) { is_command_delete = true; log::trace("NetworkTask", "got delete command"); } else if (buffer.size() >= 4 && memcmp(buffer.data(), "quit", 4) == 0) { break; } // Determine bytes_expected if (is_command_put) { uint64_t offset_end_key = 4; // skipping 'set ' while (buffer.data()[offset_end_key] != ' ') offset_end_key++; key = buffer; key.set_offset(4); key.set_size(offset_end_key - 4); offset_value = offset_end_key; while (buffer.data()[offset_value] != '\n') offset_value++; offset_value++; // for the \n log::trace("NetworkTask", "offset_value %" PRIu64, offset_value); std::smatch matches; std::string str_buffer(buffer.data(), offset_value); if (std::regex_search(str_buffer, matches, regex_put)) { size_value = atoi(std::string(matches[2]).c_str()); bytes_expected = offset_value + size_value + 2; // +2: because of the final \r\n //std::string str_debug = std::string(matches[2]); //log::trace("NetworkTask", "[%s] expected [%s] [%" PRIu64 "]", key.ToString().c_str(), str_debug.c_str(), bytes_expected); } else { // should never happen, keeping it here until fully tested log::emerg("NetworkTask", "Could not match put command [%s]", str_buffer.c_str()); break; //exit(-1); } } else if ( bytes_received_last >= 2 && buffer.data()[bytes_received_last-2] == '\r' && buffer.data()[bytes_received_last-1] == 
'\n') { bytes_expected = bytes_received_last; } else { // should never happen, keeping it here until fully tested log::emerg("NetworkTask", "Don't know what to do with this new packet [%s]", buffer.ToString().c_str()); break; //exit(-1); } } is_new = false; // Loop and get more data from the network if the buffer is not full and all the data // hasn't arrived yet if ( bytes_received_total < bytes_expected && bytes_received_buffer < server_options_.recv_socket_buffer_size) { // TODO: what if the \r\n is on the two last messages, i.e. \n is the // first character of the last message? log::trace("NetworkTask", "force looping to get the rest of the data"); is_new_buffer = false; continue; } log::trace("NetworkTask", "not looping, storing current buffer"); if (is_command_get) { std::smatch matches; std::string str_buffer = buffer.ToString(); if (std::regex_search(str_buffer, matches, regex_get)) { buffer.set_offset(4); buffer.set_size(buffer.size() - 4 - 2); kdb::MultipartReader mp_reader = db_->NewMultipartReader(read_options, buffer); Status s = mp_reader.GetStatus(); if (s.IsOK()) { log::trace("NetworkTask", "GET: found"); int ret = snprintf(buffer_send, server_options_.internal__size_buffer_send, "VALUE %s 0 %" PRIu64 "\r\n", buffer.ToString().c_str(), mp_reader.size()); if (ret < 0 || (uint64_t)ret >= server_options_.internal__size_buffer_send) { log::emerg("NetworkTask", "Network send buffer is too small"); } log::trace("NetworkTask", "GET: buffer_send [%s]", buffer_send); if (send(sockfd_, buffer_send, strlen(buffer_send), 0) == -1) { log::trace("NetworkTask", "Error: send() - %s", strerror(errno)); break; } for (mp_reader.Begin(); mp_reader.IsValid(); mp_reader.Next()) { kdb::ByteArray part; kdb::Status s = mp_reader.GetPart(&part); if (!s.IsOK()) { log::trace("NetworkTask", "Error: MultipartReader - %s", s.ToString().c_str()); break; } if (send(sockfd_, part.data(), part.size(), 0) == -1) { log::trace("NetworkTask", "Error: send() - %s", strerror(errno)); } } 
Status s = mp_reader.GetStatus(); if (!s.IsOK()) { log::trace("NetworkTask", "Error - GetPart(): %s", s.ToString().c_str()); break; // drop the connection } if (send(sockfd_, "\r\nEND\r\n", 7, 0) == -1) { log::emerg("NetworkTask", "Error: send()", strerror(errno)); break; } } else { log::trace("NetworkTask", "GET: [%s]", s.ToString().c_str()); std::string msg = "NOT_FOUND\r\n"; if (send(sockfd_, msg.c_str(), msg.length(), 0) == -1) { log::emerg("NetworkTask", "Error: send() - %s", strerror(errno)); break; } } is_new = true; is_new_buffer = true; } else { log::emerg("NetworkTask", "Could not match Get command"); break; } } else if (is_command_delete) { std::smatch matches; std::string str_buffer = buffer.ToString(); if (std::regex_search(str_buffer, matches, regex_delete)) { buffer.set_offset(7); buffer.set_size(buffer.size() - 7 - 2); Status s = db_->Delete(write_options, buffer); if (s.IsOK()) { // TODO: check for [noreply], which may be present (see Memcached protocol specs) log::trace("NetworkTask", "REMOVE: ok"); if (send(sockfd_, "DELETED\r\n", 9, 0) == -1) { log::emerg("NetworkTask", "Error - send() %s", strerror(errno)); break; } } else { log::emerg("NetworkTask", "Delete() error: [%s]", s.ToString().c_str()); break; } is_new = true; is_new_buffer = true; } else { log::emerg("NetworkTask", "Could not match Delete command"); break; } } else if (is_command_put) { uint64_t offset_chunk; ByteArray chunk = buffer; if(bytes_received_total == bytes_received_buffer) { // chunk is a first part, need to skip all the characters before the value data chunk.set_offset(offset_value); chunk.set_size(bytes_received_buffer - offset_value); offset_chunk = 0; } else { chunk.set_offset(0); chunk.set_size(bytes_received_buffer); offset_chunk = bytes_received_total - bytes_received_buffer - offset_value; } if (bytes_received_total == bytes_expected) { // Part is a last part: in case this is the last buffer, the size of the // buffer needs to be adjusted to ignore the final \r\n 
chunk.set_size(chunk.size()-2); } if (chunk.size() > 0) { log::trace("NetworkTask", "call PutPart key [%s] bytes_received_buffer:%" PRIu64 " bytes_received_total:%" PRIu64 " bytes_expected:%" PRIu64 " size_chunk:%" PRIu64, key.ToString().c_str(), bytes_received_buffer, bytes_received_total, bytes_expected, chunk.size()); Status s = db_->PutPart(write_options, key, chunk, offset_chunk, size_value); if (!s.IsOK()) { log::trace("NetworkTask", "Error - Put(): %s", s.ToString().c_str()); } } if (bytes_received_total == bytes_expected) { is_new = true; log::trace("NetworkTask", "STORED key [%s] bytes_received_buffer:%" PRIu64 " bytes_received_total:%" PRIu64 " bytes_expected:%" PRIu64, key.ToString().c_str(), bytes_received_buffer, bytes_received_total, bytes_expected); if (send(sockfd_, "STORED\r\n", 8, 0) == -1) { log::emerg("NetworkTask", "Error - send() %s", strerror(errno)); break; } } is_new_buffer = true; } else { // for debugging log::emerg("NetworkTask", "Unknown case for buffer"); //exit(-1); } } log::trace("NetworkTask", "exit and close socket"); delete[] buffer_send; close(sockfd_); } void* Server::GetSockaddrIn(struct sockaddr *sa) { if (sa->sa_family == AF_INET) { return &(((struct sockaddr_in*)sa)->sin_addr); } return &(((struct sockaddr_in6*)sa)->sin6_addr); } Status Server::Start(ServerOptions& server_options, DatabaseOptions& db_options, std::string& dbname) { server_options_ = server_options; db_options_ = db_options; dbname_ = dbname; thread_network_ = std::thread(&Server::AcceptNetworkTraffic, this); return Status::OK(); } void Server::AcceptNetworkTraffic() { // Create the database object and the thread pool db_ = new kdb::Database(db_options_, dbname_); Status s = db_->Open(); if (!s.IsOK()) { log::emerg("Server", s.ToString().c_str()); stop_requested_ = true; return; } tp_ = new ThreadPool(server_options_.num_threads); tp_->Start(); log::trace("Server", "waiting for connections..."); // Ignoring SIGPIPE, which would crash the program when writing 
to // a broken socket -- doing this because MSG_NOSIGNAL doesn't work on Mac OS X signal(SIGPIPE, SIG_IGN); struct addrinfo ai_hints, *ai_server, *ai_ptr; memset(&ai_hints, 0, sizeof(ai_hints)); ai_hints.ai_family = AF_UNSPEC; ai_hints.ai_socktype = SOCK_STREAM; ai_hints.ai_flags = AI_PASSIVE; std::string str_port = std::to_string(server_options_.interface__memcached_port); int ret; if ((ret = getaddrinfo(NULL, str_port.c_str(), &ai_hints, &ai_server)) != 0) { log::emerg("Server", "getaddrinfo: %s", gai_strerror(ret)); stop_requested_ = true; return;// Status::IOError("Server - getaddrinfo", gai_strerror(ret)); } // Bind to the first result int sockfd_listen; for(ai_ptr = ai_server; ai_ptr != NULL; ai_ptr = ai_ptr->ai_next) { if ((sockfd_listen = socket(ai_ptr->ai_family, ai_ptr->ai_socktype, ai_ptr->ai_protocol)) == -1) { continue; } int setsockopt_yes=1; if (setsockopt(sockfd_listen, SOL_SOCKET, SO_REUSEADDR, &setsockopt_yes, sizeof(setsockopt_yes)) == -1) { log::emerg("Server", "setsockopt: %s", strerror(errno)); stop_requested_ = true; freeaddrinfo(ai_server); return;// Status::IOError("Server - setsockopt", strerror(errno)); } if (bind(sockfd_listen, ai_ptr->ai_addr, ai_ptr->ai_addrlen) == -1) { continue; } break; } freeaddrinfo(ai_server); if (ai_ptr == NULL) { log::emerg("Server", "Failed to bind()"); stop_requested_ = true; return;// Status::IOError("Server - Failed to bind"); } if (listen(sockfd_listen, server_options_.listen_backlog) == -1) { log::emerg("Server", "listen(): %s", strerror(errno)); stop_requested_ = true; return;// Status::IOError("Server - listen", strerror(errno)); } sockfd_listen_ = sockfd_listen; // Create notification pipe int pipefd[2]; if(pipe(pipefd) < 0) { stop_requested_ = true; return; } sockfd_notify_recv_ = pipefd[0]; sockfd_notify_send_ = pipefd[1]; fcntl(sockfd_notify_send_, F_SETFL, O_NONBLOCK); fd_set sockfds_read; int sockfd_max = std::max(sockfd_notify_recv_, sockfd_listen) + 1; // Start accepting connections int 
sockfd_accept; struct sockaddr_storage sockaddr_client; socklen_t size_sa; char address[INET6_ADDRSTRLEN]; while (!IsStopRequested()) { FD_ZERO(&sockfds_read); FD_SET(sockfd_notify_recv_, &sockfds_read); FD_SET(sockfd_listen, &sockfds_read); log::trace("Server", "select()"); size_sa = sizeof(sockaddr_client); int ret_select = select(sockfd_max, &sockfds_read, NULL, NULL, NULL); if (ret_select < 0) { log::emerg("Server", "select() error %s", strerror(errno)); stop_requested_ = true; return; } else if (ret_select == 0) { continue; } if (!FD_ISSET(sockfd_listen, &sockfds_read)) continue; log::trace("Server", "accept()"); sockfd_accept = accept(sockfd_listen, (struct sockaddr *)&sockaddr_client, &size_sa); if (sockfd_accept == -1) continue; inet_ntop(sockaddr_client.ss_family, GetSockaddrIn((struct sockaddr *)&sockaddr_client), address, sizeof(address)); log::trace("Server", "got connection from %s\n", address); tp_->AddTask(new NetworkTask(sockfd_accept, server_options_, db_)); } log::trace("Server", "Exiting thread"); } } // end of namespace kdb ================================================ FILE: network/server.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. 
#ifndef KINGDB_SERVER_H_ #define KINGDB_SERVER_H_ #include "util/debug.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "thread/threadpool.h" #include "interface/database.h" #include "interface/multipart.h" #include "util/byte_array.h" #include "util/options.h" #include "util/logger.h" namespace kdb { class NetworkTask: public Task { public: int sockfd_; kdb::ServerOptions server_options_; kdb::Database *db_; NetworkTask(int sockfd, kdb::ServerOptions server_options, kdb::Database* db) { sockfd_ = sockfd; server_options_ = server_options; db_ = db; } virtual ~NetworkTask() {}; virtual void RunInLock(std::thread::id tid) { //std::cout << "Thread " << tid << std::endl; } virtual void Run(std::thread::id tid, uint64_t id); }; class Server { public: Server() : stop_requested_(false), sockfd_listen_(0), sockfd_notify_recv_(0), sockfd_notify_send_(0), db_(nullptr), tp_(nullptr) {} Status Start(ServerOptions& server_options, DatabaseOptions& db_options, std::string& dbname); void AcceptNetworkTraffic(); bool IsStopRequested() { return stop_requested_; } void Stop() { log::trace("Server", "Stop()"); stop_requested_ = true; if (sockfd_notify_send_ > 0) { if (write(sockfd_notify_send_, "0", 1) < 0) { log::trace("Server", "Could not send the stop notification to the server thread: %s.", strerror(errno)); } } thread_network_.join(); if (tp_ != nullptr) { tp_->Stop(); delete tp_; } if (db_ != nullptr) { db_->Close(); delete db_; } if (sockfd_listen_ > 0) close(sockfd_listen_); if (sockfd_notify_recv_ > 0) close(sockfd_notify_recv_); if (sockfd_notify_send_ > 0) close(sockfd_notify_send_); } private: void* GetSockaddrIn(struct sockaddr *sa); bool stop_requested_; std::thread thread_network_; ServerOptions server_options_; DatabaseOptions db_options_; std::string dbname_; int sockfd_listen_; int sockfd_notify_recv_; int 
sockfd_notify_send_; kdb::Database* db_; ThreadPool *tp_; }; } // end of namespace kdb #endif // KINGDB_SERVER_H_ ================================================ FILE: network/server_main.cc ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. // #include #include #include "network/server.h" #include "thread/threadpool.h" #include "util/logger.h" #include "util/options.h" #include "util/file.h" #include "util/config_parser.h" static const uint32_t kVersionServerMajor = 0; static const uint32_t kVersionServerMinor = 9; static const uint32_t kVersionServerRevision = 0; static const uint32_t kVersionServerBuild = 0; bool stop_requested = false; void termination_signal_handler(int signal) { kdb::log::info("KingServer", "Received signal [%d]\n", signal); stop_requested = true; } void crash_signal_handler(int sig) { int depth_max = 20; void *array[depth_max]; size_t depth; depth = backtrace(array, depth_max); fprintf(stderr, "Error: signal %d:\n", sig); backtrace_symbols_fd(array, depth, STDERR_FILENO); exit(1); } int daemonize() { // Adapted from Michael Kerrisk's becomeDaemon() kdb::FileUtil::kingdb_getcwd(); // cache the current working directory // before the fork() happens, otherwise // getcwd() returns "/" on Mac OS X. 
// Become background process switch (fork()) { case -1: return -1; case 0: break; default: _exit(EXIT_SUCCESS); } if (setsid() == -1) return -1; // Ensure we are not session leader switch (fork()) { case -1: return -1; case 0: break; default: _exit(EXIT_SUCCESS); } umask(0); if (chdir("/") < 0) { fprintf(stderr, "chdir(): %s\n", strerror(errno)); } return 0; } int main(int argc, char** argv) { kdb::Status s; std::string dbname = ""; std::string configfile = ""; bool run_in_foreground = false; kdb::ServerOptions server_options; kdb::DatabaseOptions db_options; // Looking for '--configfile' kdb::ConfigParser parser_configfile; parser_configfile.error_if_unknown_parameters = false; parser_configfile.AddParameter(new kdb::StringParameter( "configfile", "", &configfile, false, "Configuration file. If not specified, the path ./kingdb.conf and /etc/kingdb.conf will be tested.")); s = parser_configfile.ParseCommandLine(argc, argv); if (!s.IsOK()) { fprintf(stderr, "%s\n", s.ToString().c_str()); exit(-1); } struct stat info; if (configfile == "") { if (stat("./kingdb.conf", &info) == 0) { configfile = "./kingdb.conf"; } else if (stat("/etc/kingdb.conf", &info) == 0) { configfile = "/etc/kingdb.conf"; } } else if (stat(configfile.c_str(), &info) != 0) { fprintf(stderr, "Could not file configuration file [%s]\n", configfile.c_str()); exit(-1); } // Now parsing all options kdb::ConfigParser parser; // General options parser.AddParameter(new kdb::StringParameter( "configfile", configfile, &configfile, false, "Configuration file. If not specified, the path ./kingdb.conf and /etc/kingdb.conf will be tested.")); parser.AddParameter(new kdb::FlagParameter( "foreground", &run_in_foreground, false, "When set, the server will run as a foreground process. 
By default, the server runs as a daemon process.")); parser.AddParameter(new kdb::StringParameter( "db.path", "", &dbname, true, "Path where the database can be found or will be created.")); kdb::DatabaseOptions::AddParametersToConfigParser(db_options, parser); kdb::ServerOptions::AddParametersToConfigParser(server_options, parser); // Overwrite the default value for the WriteBuffer mode parser.SetDefaultValue("db.write-buffer.mode", "adaptive"); if (argc == 2 && (strncmp(argv[1], "--help", 6) == 0 || strncmp(argv[1], "-h", 2) == 0)) { fprintf(stdout, "KingServer is a persisted key-value database server, which uses the KingDB library\nas a storage backend. For more information, visit http://kingdb.org\n"); fprintf(stdout, "KingServer version: %d.%d.%d-%d\nKingDB version: %d.%d.%d\nData format version: %d.%d\n", kVersionServerMajor, kVersionServerMinor, kVersionServerRevision, kVersionServerBuild, kdb::kVersionMajor, kdb::kVersionMinor, kdb::kVersionRevision, kdb::kVersionDataFormatMajor, kdb::kVersionDataFormatMinor); fprintf(stdout, "\nParameters:\n\n"); parser.PrintUsage(); exit(0); } if (argc == 2 && (strncmp(argv[1], "--generate-doc", 6) == 0 || strncmp(argv[1], "-h", 2) == 0)) { fprintf(stdout, "Generating the parameter list in markdown format for use in the documentation.\n\n"); parser.PrintMarkdown(); exit(0); } if (configfile != "") { s = parser.ParseFile(configfile); if (!s.IsOK()) { fprintf(stderr, "%s\n", s.ToString().c_str()); exit(-1); } } s = parser.ParseCommandLine(argc, argv); if (!s.IsOK()) { fprintf(stderr, "%s\n", s.ToString().c_str()); exit(-1); } if (!parser.FoundAllMandatoryParameters()) { parser.PrintAllMissingMandatoryParameters(); exit(-1); } if (db_options.log_level != "" && kdb::Logger::set_current_level(db_options.log_level.c_str()) < 0) { fprintf(stderr, "Unknown log level: [%s]\n", db_options.log_level.c_str()); exit(-1); } kdb::Logger::set_target(db_options.log_target); kdb::CompressionType ctype; if 
(db_options.storage__compression_algorithm == "disabled") { ctype = kdb::kNoCompression; } else if (db_options.storage__compression_algorithm == "lz4") { ctype = kdb::kLZ4Compression; } else { fprintf(stderr, "Unknown compression algorithm: [%s]\n", db_options.storage__compression_algorithm.c_str()); exit(-1); } db_options.compression = ctype; kdb::HashType htype; if (db_options.storage__hashing_algorithm == "xxhash-64") { htype = kdb::kxxHash_64; } else if (db_options.storage__hashing_algorithm == "murmurhash3-64") { htype = kdb::kMurmurHash3_64; } else { fprintf(stderr, "Unknown hashing algorithm: [%s]\n", db_options.storage__hashing_algorithm.c_str()); exit(-1); } db_options.hash = htype; kdb::WriteBufferMode wbm; if (db_options.write_buffer__mode_str == "direct") { wbm = kdb::kWriteBufferModeDirect; } else if (db_options.write_buffer__mode_str == "adaptive") { wbm = kdb::kWriteBufferModeAdaptive; } else { fprintf(stderr, "Unknown write buffer mode: [%s]\n", db_options.write_buffer__mode_str.c_str()); exit(-1); } db_options.write_buffer__mode = wbm; kdb::FileUtil::increase_limit_open_files(); #ifndef DEBUG #endif signal(SIGINT, termination_signal_handler); signal(SIGTERM, termination_signal_handler); signal(SIGSEGV, crash_signal_handler); signal(SIGABRT, crash_signal_handler); if (!run_in_foreground && daemonize() < 0) { fprintf(stderr, "Could not daemonize the process\n"); exit(-1); } kdb::Server server; server.Start(server_options, db_options, dbname); kdb::log::info("KingServer", "Daemon has started"); while (!stop_requested && !server.IsStopRequested()) { std::this_thread::sleep_for(std::chrono::milliseconds(500)); } server.Stop(); kdb::log::info("KingServer", "Daemon has stopped"); return 0; } ================================================ FILE: storage/format.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. 
// Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. #ifndef KINGDB_FORMAT_H_ #define KINGDB_FORMAT_H_ #include "util/debug.h" #include #include #include #include #include #include #include #include #include #include "util/version.h" #include "util/logger.h" #include "util/status.h" #include "algorithm/coding.h" #include "algorithm/crc32c.h" #include "util/options.h" namespace kdb { static const uint32_t kVersionDataFormatMajor = 1; static const uint32_t kVersionDataFormatMinor = 0; // 32-bit flags // NOTE: kEntryFirst, kEntryMiddle and kEntryLast are not used yet, // they are reserved for possible future implementation. enum EntryHeaderFlag { kTypeDelete = 0x1, kIsUncompacted = 0x2, kHasPadding = 0x4, kEntryFull = 0x8, kEntryFirst = 0x10, kEntryMiddle = 0x20, kEntryLast = 0x40 }; struct EntryHeader { EntryHeader() { flags = 0; } uint8_t checksum_header; uint32_t checksum_content; uint32_t flags; uint64_t size_key; uint64_t size_value; uint64_t size_value_compressed; uint64_t size_padding; uint64_t hash; // Helpers, not store on secondary storage int32_t size_header_serialized; void print() { log::trace("EntryHeader::print()", "flags:%u checksum_content:0x%08" PRIx64 " size_key:%" PRIu64 " size_value:%" PRIu64 " size_value_compressed:%" PRIu64 " size_padding:%" PRIu64 " hash:0x%08" PRIx64, flags, checksum_content, size_key, size_value, size_value_compressed, size_padding, hash); } static uint64_t CalculatePaddingSize(uint64_t size_value) { // NOTE: Here I picked an arbitrary frame size of 64KB, and do an estimate // of the padding necessary for the frame headers based on the current size // of the value. // This logic is related to the compression algorithms, and therefore should // be moved to the compression classes. 
uint64_t size_frame_header = 8; return (size_value / (64*1024) + 1) * size_frame_header; }

  // Flag helpers: each one sets or clears a single bit in 'flags'.
  void SetHasPadding(bool b) {
    if (b) {
      flags |= kHasPadding;
    } else {
      flags &= ~kHasPadding;
    }
  }

  bool HasPadding() {
    return (flags & kHasPadding);
  }

  void SetIsUncompacted(bool b) {
    if (b) {
      flags |= kIsUncompacted;
    } else {
      flags &= ~kIsUncompacted;
    }
  }

  bool IsUncompacted() {
    return (flags & kIsUncompacted);
  }

  void SetTypeDelete() {
    flags |= kTypeDelete;
  }

  void SetTypePut() {
    // do nothing -- "put" is the absence of the kTypeDelete bit
  }

  // Sanity check: the entry (header + key + value area) must fit within the
  // file, and the key must be non-empty.
  bool AreSizesValid(uint32_t offset_file, uint64_t filesize) {
    return (   size_key > 0
            && offset_file + size_header_serialized + size_key + size_value_offset() <= filesize);
  }

  bool IsTypeDelete() {
    log::trace("IsTypeDelete()", "flags %u", flags);
    return (flags & kTypeDelete);
  }

  bool IsTypePut() {
    return !IsTypeDelete();
  }

  void SetEntryFull() {
    flags |= kEntryFull;
  }

  bool IsEntryFull() {
    log::trace("IsEntryFull()", "flags %u", flags);
    return (flags & kEntryFull);
  }

  bool IsCompressed() {
    return (size_value_compressed > 0);
  }

  // Number of value bytes actually used on disk (compressed size if the
  // value is compressed, raw size otherwise).
  uint64_t size_value_used() {
    if (IsCompressed()) {
      return size_value_compressed;
    } else {
      return size_value;
    }
  }

  // Number of bytes reserved for the value area in the file: uncompacted
  // entries keep the full (raw + padding) reservation.
  uint64_t size_value_offset() {
    if (!IsCompressed() || IsUncompacted()) {
      return size_value + size_padding;
    } else {
      return size_value_compressed;
    }
  }

  // Deserializes an EntryHeader from 'buffer_in' into 'output'.
  // Layout: 1-byte header checksum, fixed32 content checksum, varint32 flags,
  // varint64 size_key, varint64 size_value, then (only when the database uses
  // compression) fixed64 size_value_compressed and varint64 size_padding,
  // and finally a fixed64 hashed key. '*num_bytes_read' receives the number
  // of bytes consumed. Returns IOError on truncated/invalid input or on a
  // header checksum mismatch (when read_options.verify_checksums is set).
  static Status DecodeFrom(const DatabaseOptions& db_options, const ReadOptions& read_options, const char* buffer_in, uint64_t num_bytes_max, struct EntryHeader *output, uint32_t *num_bytes_read) {
    /*
    // Dumb serialization for debugging
    log::trace("EntryHeader::DecodeFrom", "start num_bytes_max:%" PRIu64 " - sizeof(EntryHeader):%d", num_bytes_max, sizeof(struct EntryHeader));
    char *buffer = const_cast(buffer_in);
    struct EntryHeader* entry_header = reinterpret_cast(buffer);
    *output = *entry_header;
    *num_bytes_read = sizeof(struct EntryHeader);
    return Status::OK();
    */
    int length;
    char *buffer = const_cast(buffer_in);
    char *ptr = buffer;
    int size = num_bytes_max;
    output->checksum_header = ptr[0];
    ptr += 1;
    size -= 1;
    GetFixed32(ptr, &(output->checksum_content));
    ptr += 4;
    size -= 4;
    length = GetVarint32(ptr, size, &(output->flags));
    if (length == -1) return Status::IOError("Decoding error");
    ptr += length;
    size -= length;
    length = GetVarint64(ptr, size, &(output->size_key));
    if (length == -1) return Status::IOError("Decoding error");
    ptr += length;
    size -= length;
    length = GetVarint64(ptr, size, &(output->size_value));
    if (length == -1) return Status::IOError("Decoding error");
    ptr += length;
    size -= length;
    // The compressed-size/padding fields are only present on disk when the
    // database was created with compression enabled.
    if (db_options.compression.type != kNoCompression) {
      GetFixed64(ptr, &(output->size_value_compressed));
      ptr += 8;
      size -= 8;
      length = GetVarint64(ptr, size, &(output->size_padding));
      if (length == -1) return Status::IOError("Decoding error");
      ptr += length;
      size -= length;
    } else {
      output->size_value_compressed = 0;
      output->size_padding = 0;
    }
    if (size < 8) return Status::IOError("Decoding error");
    GetFixed64(ptr, &(output->hash));
    ptr += 8;
    size -= 8;
    *num_bytes_read = num_bytes_max - size;
    output->size_header_serialized = *num_bytes_read;
    if (read_options.verify_checksums) {
      // The first byte stores a CRC8 of the remainder of the header.
      uint8_t checksum_header = crc32c::crc8(0, buffer + 1, output->size_header_serialized - 1);
      if (checksum_header != output->checksum_header) {
        return Status::IOError("Header checksum mismatch");
      }
    }
    //log::trace("EntryHeader::DecodeFrom", "size:%u", *num_bytes_read);
    return Status::OK();
  }

  // Serializes 'input' into 'buffer' (inverse of DecodeFrom); returns the
  // number of bytes written. The CRC8 of the serialized header is stored in
  // the first byte.
  static uint32_t EncodeTo(const DatabaseOptions& db_options, const struct EntryHeader *input, char* buffer) {
    /*
    // Dumb serialization for debugging
    struct EntryHeader *input_noncast = const_cast(input);
    memcpy(buffer, reinterpret_cast(input_noncast), sizeof(struct EntryHeader));
    return sizeof(struct EntryHeader);
    */
    // NOTE: it would be interesting to run an analysis and determine if it is
    // better to store the crc32 and hash using fixed encoding or varints. For
    // the hash, it will certainly be specific to each hash function.
    char *ptr = buffer + 1; // save 1 byte for the header checksum
    EncodeFixed32(ptr, input->checksum_content);
    ptr = EncodeVarint32(ptr + 4, input->flags);
    ptr = EncodeVarint64(ptr, input->size_key);
    ptr = EncodeVarint64(ptr, input->size_value);
    if (db_options.compression.type != kNoCompression) {
      EncodeFixed64(ptr, input->size_value_compressed);
      ptr += 8;
      ptr = EncodeVarint64(ptr, input->size_padding);
    }
    EncodeFixed64(ptr, input->hash);
    ptr += 8;
    buffer[0] = crc32c::crc8(0, buffer + 1, (ptr-buffer) - 1);
    //log::trace("EntryHeader::EncodeTo", "size:%u", ptr - buffer);
    //PrintHex(buffer, ptr - buffer);
    return (ptr - buffer);
  }
};

struct EntryFooter {
  // NOTE: at first I wanted to have the CRC32 as part of an entry footer, but
  // since the compressed size has to be written in the header anyway, the
  // header has to be written twice, thus having a footer is not necessary.
  // I'm keeping this here to keep the idea in mind.
  uint32_t crc32;
};

// Encodes/decodes the subset of DatabaseOptions that is persisted in every
// HSTable file header (fixed 48-byte layout, CRC32-protected).
struct DatabaseOptionEncoder {
  // Deserializes options from 'buffer_in'; validates checksum and data
  // format version before reading any field.
  static Status DecodeFrom(const char* buffer_in, uint64_t num_bytes_max, struct DatabaseOptions *output) {
    if (num_bytes_max < GetFixedSize()) return Status::IOError("Decoding error");
    uint32_t crc32_computed = crc32c::Value(buffer_in + 4, GetFixedSize() - 4);
    uint32_t crc32_stored;
    GetFixed32(buffer_in, &crc32_stored);
    if (crc32_computed != crc32_stored) return Status::IOError("Invalid checksum");
    uint32_t version_data_format_major, version_data_format_minor;
    GetFixed32(buffer_in + 20, &version_data_format_major);
    GetFixed32(buffer_in + 24, &version_data_format_minor);
    if (   version_data_format_major != kVersionDataFormatMajor
        || version_data_format_minor != kVersionDataFormatMinor) {
      return Status::IOError("Data format version not supported");
    }
    uint32_t hash, compression_type, checksum_type;
    GetFixed64(buffer_in + 28, &(output->storage__hstable_size));
    GetFixed32(buffer_in + 36, &hash);
    GetFixed32(buffer_in + 40, &compression_type);
    GetFixed32(buffer_in + 44, &checksum_type);
    if (hash == 0x0) {
output->hash = kMurmurHash3_64; } else if (hash == 0x1) { output->hash = kxxHash_64; } else { return Status::IOError("Unknown hash type"); } if (compression_type == 0x0) { output->compression.type = kNoCompression; } else if (compression_type == 0x1) { output->compression.type = kLZ4Compression; } else { return Status::IOError("Unknown compression type"); } if (compression_type == 0x1) { output->checksum = kCRC32C; } else { return Status::IOError("Unknown checksum type"); } return Status::OK(); } static uint32_t EncodeTo(const struct DatabaseOptions *input, char* buffer) { EncodeFixed32(buffer + 4, kVersionMajor); EncodeFixed32(buffer + 8, kVersionMinor); EncodeFixed32(buffer + 12, kVersionRevision); EncodeFixed32(buffer + 16, kVersionBuild); EncodeFixed32(buffer + 20, kVersionDataFormatMajor); EncodeFixed32(buffer + 24, kVersionDataFormatMinor); EncodeFixed64(buffer + 28, input->storage__hstable_size); EncodeFixed32(buffer + 36, (uint32_t)input->hash); EncodeFixed32(buffer + 40, (uint32_t)input->compression.type); EncodeFixed32(buffer + 44, (uint32_t)input->checksum); uint32_t crc32 = crc32c::Value(buffer + 4, GetFixedSize() - 4); EncodeFixed32(buffer, crc32); return GetFixedSize(); } static uint32_t GetFixedSize() { return 48; // in bytes } }; enum FileType { kUnknownType = 0x0, kUncompactedRegularType = 0x1, kCompactedRegularType = 0x2, kCompactedLargeType = 0x4, }; struct HSTableHeader { uint32_t crc32; uint32_t version_major; uint32_t version_minor; uint32_t version_revision; uint32_t version_build; uint32_t version_data_format_major; uint32_t version_data_format_minor; uint32_t filetype; uint64_t timestamp; HSTableHeader() { filetype = 0; } FileType GetFileType() { if (filetype & kCompactedLargeType) { return kCompactedLargeType; } else if (filetype & kCompactedRegularType) { return kCompactedRegularType; } else if (filetype & kUncompactedRegularType) { return kUncompactedRegularType; } return kUnknownType; } bool IsTypeLarge() { return (filetype & 
kCompactedLargeType); }

  // True if the file has been compacted (regular or large compacted type).
  bool IsTypeCompacted() {
    return (   filetype & kCompactedRegularType
            || filetype & kCompactedLargeType);
  }

  bool IsFileVersionSupported() {
    return (   version_data_format_major == kVersionDataFormatMajor
            && version_data_format_minor == kVersionDataFormatMinor);
  }

  // True if the file was written with a data format newer than this build.
  bool IsFileVersionNewer() {
    if (   version_data_format_major > kVersionDataFormatMajor
        || (   version_data_format_major == kVersionDataFormatMajor
            && version_data_format_minor > kVersionDataFormatMinor)
       ) {
      return true;
    }
    return false;
  }

  // Deserializes a header from 'buffer_in'. The CRC32 stored in the first
  // four bytes covers the following 20 bytes. If 'db_options_out' is
  // non-null, the DatabaseOptions block that follows the fixed header is
  // decoded as well.
  static Status DecodeFrom(const char* buffer_in, uint64_t num_bytes_max, struct HSTableHeader *output, struct DatabaseOptions *db_options_out=nullptr) {
    if (num_bytes_max < GetFixedSize()) return Status::IOError("Decoding error");
    GetFixed32(buffer_in     , &(output->crc32));
    GetFixed32(buffer_in +  4, &(output->version_data_format_major));
    GetFixed32(buffer_in +  8, &(output->version_data_format_minor));
    GetFixed32(buffer_in + 12, &(output->filetype));
    GetFixed64(buffer_in + 16, &(output->timestamp));
    uint32_t crc32_computed = crc32c::Value(buffer_in + 4, 20);
    if (!output->IsFileVersionSupported()) return Status::IOError("Data format version not supported");
    if (crc32_computed != output->crc32) return Status::IOError("Invalid checksum");
    if (db_options_out == nullptr) return Status::OK();
    Status s = DatabaseOptionEncoder::DecodeFrom(buffer_in + GetFixedSize(), num_bytes_max - GetFixedSize(), db_options_out);
    return s;
  }

  // Serializes the header followed by the encoded DatabaseOptions; returns
  // the total number of bytes written.
  static uint32_t EncodeTo(const struct HSTableHeader *input, const struct DatabaseOptions* db_options, char* buffer) {
    EncodeFixed32(buffer +  4, kVersionDataFormatMajor);
    EncodeFixed32(buffer +  8, kVersionDataFormatMinor);
    EncodeFixed32(buffer + 12, input->filetype);
    EncodeFixed64(buffer + 16, input->timestamp);
    uint32_t crc32 = crc32c::Value(buffer + 4, 20);
    EncodeFixed32(buffer, crc32);
    int size_db_options = DatabaseOptionEncoder::EncodeTo(db_options, buffer + GetFixedSize());
    return GetFixedSize() + size_db_options;
  }

  static uint32_t GetFixedSize() {
    return 24; // in bytes
  }
};

enum HSTableFooterFlags {
  kHasPaddingInValues = 0x1, // 1 if some values have size_value space but only use size_value_compressed and therefore need compaction, 0 otherwise
  kHasInvalidEntries  = 0x2  // 1 if some values have erroneous content that needs to be washed out in a compaction process -- will be set to 1 during a file recovery
};

// Fixed-size footer written at the end of every finalized HSTable file;
// points back at the Offset Array so the in-memory index can be rebuilt.
struct HSTableFooter {
  uint32_t filetype;
  uint32_t flags;
  uint64_t offset_indexes; // file offset where the Offset Array starts
  uint64_t num_entries;
  uint64_t magic_number;
  uint32_t crc32;

  HSTableFooter() {
    flags = 0;
    filetype = 0;
  }

  bool IsTypeLarge() {
    return (filetype & kCompactedLargeType);
  }

  bool IsTypeCompacted() {
    return (   filetype & kCompactedRegularType
            || filetype & kCompactedLargeType);
  }

  void SetFlagHasPaddingInValues() {
    flags |= kHasPaddingInValues;
  }

  void SetFlagHasInvalidEntries() {
    flags |= kHasInvalidEntries;
  }

  // Deserializes a footer from 'buffer_in' (no checksum verification here).
  static Status DecodeFrom(const char* buffer_in, uint64_t num_bytes_max, struct HSTableFooter *output) {
    if (num_bytes_max < GetFixedSize()) return Status::IOError("Decoding error");
    GetFixed32(buffer_in,      &(output->filetype));
    GetFixed32(buffer_in +  4, &(output->flags));
    GetFixed64(buffer_in +  8, &(output->offset_indexes));
    GetFixed64(buffer_in + 16, &(output->num_entries));
    GetFixed64(buffer_in + 24, &(output->magic_number));
    GetFixed32(buffer_in + 32, &(output->crc32));
    return Status::OK();
  }

  // Serializes the footer; the trailing CRC32 slot is left for the caller.
  static uint32_t EncodeTo(const struct HSTableFooter *input, char* buffer) {
    EncodeFixed32(buffer,      input->filetype);
    EncodeFixed32(buffer +  4, input->flags);
    EncodeFixed64(buffer +  8, input->offset_indexes);
    EncodeFixed64(buffer + 16, input->num_entries);
    EncodeFixed64(buffer + 24, input->magic_number);
    // the checksum is computed in the method that writes the footer
    return GetFixedSize();
  }

  static uint32_t GetFixedSize() {
    return 36; // in bytes
  }
};

// One row of the Offset Array: maps a hashed key to the offset of its entry
// within the HSTable file. Stored as two varints.
struct OffsetArrayRow {
  uint64_t hashed_key;
  uint32_t offset_entry;

  static Status DecodeFrom(const char* buffer_in, uint64_t num_bytes_max, struct OffsetArrayRow *output,
uint32_t *num_bytes_read) {
    int length;
    char *ptr = const_cast(buffer_in);
    int size = num_bytes_max;
    length = GetVarint64(ptr, size, &(output->hashed_key));
    if (length == -1) return Status::IOError("Decoding error");
    ptr += length;
    size -= length;
    length = GetVarint32(ptr, size, &(output->offset_entry));
    if (length == -1) return Status::IOError("Decoding error");
    ptr += length;
    size -= length;
    *num_bytes_read = num_bytes_max - size;
    return Status::OK();
  }

  // Serializes the row as two varints; returns the number of bytes written.
  static uint32_t EncodeTo(const struct OffsetArrayRow *input, char* buffer) {
    char *ptr;
    ptr = EncodeVarint64(buffer, input->hashed_key);
    ptr = EncodeVarint32(ptr, input->offset_entry);
    return (ptr - buffer);
  }
};

} // namespace kdb

#endif // KINGDB_FORMAT_H_

================================================ FILE: storage/hstable_manager.h ================================================

// Copyright (c) 2014, Emmanuel Goossaert. All rights reserved.
// Use of this source code is governed by the BSD 3-Clause License,
// that can be found in the LICENSE file.

#ifndef KINGDB_HSTABLE_MANAGER_H_
#define KINGDB_HSTABLE_MANAGER_H_

#include "util/debug.h"

// NOTE(review): the angle-bracket targets of the system/stdlib includes below
// appear to have been stripped by text extraction -- restore them from the
// original file before compiling.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#include "util/options.h"
#include "util/order.h"
#include "util/byte_array.h"
#include "util/file.h"
#include "algorithm/crc32c.h"
#include "algorithm/hash.h"
#include "storage/format.h"
#include "storage/resource_manager.h"

namespace kdb {

// A HSTable (Hashed String Table) is a file consisting of entries, followed by
// an Offset Array. The entries are a sequence of bytes in the form ,
// and for each entry, the Offset Array has one item which is the hashed key of
// that entry, and the offset where the entry can be found in the file.
// The Offset Array can be used to quickly build a hash table in memory,
// mapping hashed keys to locations in HSTables.
class HSTableManager {
 public:
  // Default-constructed managers are inert (closed, read-only, no buffers).
  HSTableManager() {
    is_closed_ = true;
    is_read_only_ = true;
    has_file_ = false;
    buffer_has_items_ = false;
    has_sync_option_ = false;
  }

  HSTableManager(DatabaseOptions& db_options, std::string dbname, std::string prefix, std::string prefix_compaction, std::string dirpath_locks, FileType filetype_default, bool read_only=false)
      : db_options_(db_options),
        is_read_only_(read_only),
        filetype_default_(filetype_default),
        fileid_(0),
        sequence_fileid_(0),
        sequence_timestamp_(0),
        prefix_(prefix),
        prefix_compaction_(prefix_compaction),
        dirpath_locks_(dirpath_locks),
        wait_until_can_open_new_files_(false) {
    log::trace("HSTableManager::HSTableManager()", "dbname:%s prefix:%s", dbname.c_str(), prefix.c_str());
    dbname_ = dbname;
    hash_ = MakeHash(db_options.hash);
    Reset();
    // Write buffers are only needed in read-write mode; Reset() has set
    // size_block_ from the options before the allocation happens.
    if (!is_read_only_) {
      buffer_raw_ = new char[size_block_*2];
      buffer_index_ = new char[size_block_*2];
    }
  }

  ~HSTableManager() {
    Close();
  }

  // Re-initializes all per-file state (does not touch the buffers).
  void Reset() {
    file_resource_manager.Reset();
    sequence_fileid_ = 0;
    sequence_timestamp_ = 0;
    size_block_ = db_options_.storage__hstable_size;
    has_file_ = false;
    buffer_has_items_ = false;
    is_closed_ = false;
    is_locked_sequence_timestamp_ = false;
    offset_start_ = 0;
    offset_end_ = 0;
  }

  // Flushes and closes the current file and releases buffers; idempotent.
  void Close() {
    std::unique_lock lock(mutex_close_);
    if (is_read_only_ || is_closed_) return;
    is_closed_ = true;
    FlushCurrentFile();
    CloseCurrentFile();
    delete hash_;
    if (!is_read_only_) {
      delete[] buffer_raw_;
      delete[] buffer_index_;
    }
  }

  std::string GetPrefix() {
    return prefix_;
  }

  std::string GetFilepath(uint32_t fileid) {
    return dbname_ + "/" + prefix_ + HSTableManager::num_to_hex(fileid); // TODO: optimize here
  }

  std::string GetLockFilepath(uint32_t fileid) {
    return dirpath_locks_ + "/" + HSTableManager::num_to_hex(fileid); // TODO: optimize here
  }

  // File id sequence helpers
  void SetSequenceFileId(uint32_t seq) {
    std::unique_lock lock(mutex_sequence_fileid_);
    sequence_fileid_ = seq;
    log::trace("HSTableManager::SetSequenceFileId", "seq:%u", seq);
  }

  // Returns the id of the most recent file whose contents are stable (i.e.
  // the current file is excluded while it is still open for writing).
  uint32_t GetSequenceFileIdForStableId() {
    std::unique_lock lock(mutex_sequence_fileid_);
    uint64_t fileid = sequence_fileid_;
    if (!has_file_) fileid += 1;
    return fileid;
  }

  uint32_t GetSequenceFileId() {
    std::unique_lock lock(mutex_sequence_fileid_);
    return sequence_fileid_;
  }

  uint32_t IncrementSequenceFileId(uint32_t inc) {
    std::unique_lock lock(mutex_sequence_fileid_);
    log::trace("HSTableManager::IncrementSequenceFileId", "sequence_fileid_:%u, inc:%u", sequence_fileid_, inc);
    sequence_fileid_ += inc;
    return sequence_fileid_;
  }

  // Timestamp sequence helpers -- all of them are no-ops once the timestamp
  // has been locked with LockSequenceTimestamp() (except the lock itself).
  void SetSequenceTimestamp(uint32_t seq) {
    std::unique_lock lock(mutex_sequence_timestamp_);
    if (!is_locked_sequence_timestamp_) sequence_timestamp_ = seq;
  }

  uint64_t GetSequenceTimestamp() {
    std::unique_lock lock(mutex_sequence_timestamp_);
    return sequence_timestamp_;
  }

  uint64_t IncrementSequenceTimestamp(uint64_t inc) {
    std::unique_lock lock(mutex_sequence_timestamp_);
    if (!is_locked_sequence_timestamp_) sequence_timestamp_ += inc;
    return sequence_timestamp_;
  }

  void LockSequenceTimestamp(uint64_t seq) {
    std::unique_lock lock(mutex_sequence_timestamp_);
    is_locked_sequence_timestamp_ = true;
    sequence_timestamp_ = seq;
  }

  // Formats 'num' as an 8-digit zero-padded hexadecimal string.
  static std::string num_to_hex(uint64_t num) {
    char buffer[20];
    sprintf(buffer, "%08" PRIx64, num);
    return std::string(buffer);
  }

  static uint32_t hex_to_num(char* hex) {
    uint32_t num;
    sscanf(hex, "%x", &num);
    return num;
  }

  uint32_t GetHighestStableFileId(uint32_t fileid_start) {
    // TODO: Extract the HSTable repair logic out of this method. This method
    // should only be computing the highest stable file id, and not do anything
    // else than that. I took this implementation shortcut to get the first beta
    // version out asap, this needs to be cleaned up at some point.
uint32_t fileid_max = GetSequenceFileIdForStableId(); //GetSequenceFileId();
    uint32_t fileid_stable = 0;
    uint32_t fileid_candidate = fileid_start;
    uint64_t epoch_now = file_resource_manager.GetEpochNow();

    // Walk file ids from 'fileid_start' upward, stopping at the first file
    // that is not yet stable; files whose in-progress writes have timed out
    // are finalized on the way (see TODO above about extracting this logic).
    while (true) {
      if (fileid_candidate >= fileid_max) break;
      uint32_t num_writes = file_resource_manager.GetNumWritesInProgress(fileid_candidate);
      int fd = 0;
      if (num_writes > 0) {
        uint64_t epoch = file_resource_manager.GetEpochLastActivity(fileid_candidate);
        if (epoch > epoch_now - db_options_.storage__inactivity_timeout) {
          // The in-progress writes for this file haven't timed out yet, thus it
          // is not stable yet.
          break;
        } else {
          // The file epoch is such that the in-progress writes to the file have
          // timed out. All temporary data is cleared: future incoming writes to
          // this file will fail, and at the next startup, its internal index
          // will be recovered. And this is what we want: we don't want to
          // recover the file now, the recovery process should only run at
          // database startup.
          // TODO-37: cleanup key_to_location and key_to_headersize for all keys
          // that belong to the file id being cleaned up.
          std::string filepath = GetFilepath(fileid_candidate);
          bool is_file_large = file_resource_manager.IsFileLarge(fileid_candidate);
          // Remove large files that are detected as being inactive
          if (is_file_large) {
            if (std::remove(filepath.c_str()) != 0) {
              log::emerg("GetHighestStableFileId()", "Could not remove large file [%s]", filepath.c_str());
            }
            file_resource_manager.ClearTemporaryDataForFileId(fileid_candidate);
            goto handle_next_file;
          }

          // Write the OffsetArray for this inactive file
          log::trace("HSTableManager::GetHighestStableFileId()", "About to write Offset Array");
          if ((fd = open(filepath.c_str(), O_WRONLY, 0644)) < 0) {
            log::emerg("HSTableManager::GetHighestStableFileId()", "Could not open file [%s]: %s", filepath.c_str(), strerror(errno));
            goto handle_next_file;
          }

          // TODO: factorize this code with FlushOffsetArray()
          // Truncate away any pre-allocated space beyond the recorded file
          // size before appending the Offset Array.
          uint64_t filesize_before = file_resource_manager.GetFileSize(fileid_candidate);
          if (ftruncate(fd, filesize_before) < 0) {
            log::emerg("HSTableManager::GetHighestStableFileId()", "Error ftruncate(): %s", strerror(errno));
            goto handle_next_file;
          }
          uint64_t size_offarray;
          Status s = WriteOffsetArray(fd, file_resource_manager.GetOffsetArray(fileid_candidate), &size_offarray, filetype_default_, file_resource_manager.HasPaddingInValues(fileid_candidate), true);
          if (!s.IsOK()) {
            log::emerg("HSTableManager::GetHighestStableFileId()", "Error on WriteOffsetArray(): %s", s.ToString().c_str());
            goto handle_next_file;
          }
          uint64_t filesize = file_resource_manager.GetFileSize(fileid_candidate);
          file_resource_manager.SetFileSize(fileid_candidate, filesize + size_offarray);
          file_resource_manager.ClearTemporaryDataForFileId(fileid_candidate);
        }
      }
      fileid_stable = fileid_candidate;
      handle_next_file:
      if (fd != 0) close(fd);
      fileid_candidate += 1;
    }
    return fileid_stable;
  }

  // Opens the next file in the sequence for writing, retrying until the open
  // succeeds, and reserves space for the HSTable header in the write buffer.
  void OpenNewFile() {
    log::trace("HSTableManager::OpenNewFile()", "Opening file (before) [%s]: %u", filepath_.c_str(), GetSequenceFileId());
    IncrementSequenceFileId(1);
    IncrementSequenceTimestamp(1);
filepath_ = GetFilepath(GetSequenceFileId());
    log::trace("HSTableManager::OpenNewFile()", "Opening file [%s]: %u", filepath_.c_str(), GetSequenceFileId());
    // Keep retrying on failure (e.g. too many open files); the retry state is
    // exposed to callers through CanOpenNewFiles().
    while (true) {
      if ((fd_ = open(filepath_.c_str(), O_WRONLY|O_CREAT, 0644)) < 0) {
        log::emerg("HSTableManager::OpenNewFile()", "Could not open file [%s]: %s", filepath_.c_str(), strerror(errno));
        wait_until_can_open_new_files_ = true;
        std::this_thread::sleep_for(std::chrono::milliseconds(db_options_.internal__open_file_retry_delay));
        continue;
      }
      wait_until_can_open_new_files_ = false;
      break;
    }
    has_file_ = true;
    fileid_ = GetSequenceFileId();
    timestamp_ = GetSequenceTimestamp();

    // Reserving space for header
    offset_start_ = 0;
    offset_end_ = db_options_.internal__hstable_header_size;

    // Filling in default header
    struct HSTableHeader hstheader;
    hstheader.filetype = filetype_default_;
    hstheader.timestamp = timestamp_;
    HSTableHeader::EncodeTo(&hstheader, &db_options_, buffer_raw_);
  }

  bool CanOpenNewFiles() {
    return !wait_until_can_open_new_files_;
  }

  void CloseCurrentFile() {
    if (!has_file_) return;
    log::trace("HSTableManager::CloseCurrentFile()", "ENTER - fileid_:%d", fileid_);

    // The offarray should only be written if there are no more incoming writes to
    // the current file. If there are still writes in progress, the offarray should not
    // be written so that the next database start-up can trigger a recovery process.
    // Same goes with files that had in-progress writes but timed out: their
    // offarray will not be written so that they will be recovered at start-up.
    FlushOffsetArray();

    close(fd_);
    buffer_has_items_ = false;
    has_file_ = false;
  }

  // Writes any buffered items to the current file, applies optional trailing
  // 'padding', honors pending sync requests, and closes the file when it has
  // reached size_block_ (or when 'force_new_file' is set and the file has
  // content beyond the header). Returns the id of the file written to, or 0
  // on error.
  uint32_t FlushCurrentFile(int force_new_file=0, uint64_t padding=0) {
    if (!has_file_) return 0;
    uint32_t fileid_out = fileid_;
    log::trace("HSTableManager::FlushCurrentFile()", "ENTER - fileid_:%d, has_file_:%d, buffer_has_items_:%d", fileid_, has_file_, buffer_has_items_);
    if (has_file_ && buffer_has_items_) {
      log::trace("HSTableManager::FlushCurrentFile()", "has_files && buffer_has_items_ - fileid_:%d", fileid_);
      if (write(fd_, buffer_raw_ + offset_start_, offset_end_ - offset_start_) < 0) {
        log::emerg("HSTableManager::FlushCurrentFile()", "Error write(): %s", strerror(errno));
        return 0;
      }
      file_resource_manager.SetFileSize(fileid_, offset_end_);
      offset_start_ = offset_end_;
      buffer_has_items_ = false;
      log::trace("HSTableManager::FlushCurrentFile()", "items written - offset_end_:%d | size_block_:%d | force_new_file:%d", offset_end_, size_block_, force_new_file);
    }

    if (padding) {
      // Extend the file with empty space (ftruncate) and reposition at the
      // end so subsequent appends land after the padding.
      offset_end_ += padding;
      offset_start_ = offset_end_;
      file_resource_manager.SetFileSize(fileid_, offset_end_);
      if (ftruncate(fd_, offset_end_) < 0) {
        log::emerg("HSTableManager::FlushCurrentFile()", "Error ftruncate(): %s", strerror(errno));
        return 0;
      }
      if (lseek(fd_, 0, SEEK_END) < 0) {
        log::emerg("HSTableManager::FlushCurrentFile()", "Error lseek(): %s", strerror(errno));
        return 0;
      }
    }

    if (has_sync_option_) {
      has_sync_option_ = false;
      if (FileUtil::sync_file(fd_) < 0) {
        log::emerg("HSTableManager::FlushCurrentFile()", "Error sync_file(): %s", strerror(errno));
      }
    }

    if (offset_end_ >= size_block_ || (force_new_file && offset_end_ > db_options_.internal__hstable_header_size)) {
      log::trace("HSTableManager::FlushCurrentFile()", "file renewed - force_new_file:%d", force_new_file);
      file_resource_manager.SetFileSize(fileid_, offset_end_);
      CloseCurrentFile();
      //OpenNewFile();
    } else {
      //fileid_out = fileid_out - 1;
    }
    log::trace("HSTableManager::FlushCurrentFile()", "done!");
    return fileid_out;
  }

  Status FlushOffsetArray()
{
    if (!has_file_) return Status::OK();
    uint32_t num = file_resource_manager.GetNumWritesInProgress(fileid_);
    log::trace("HSTableManager::FlushOffsetArray()", "ENTER - fileid_:%d - num_writes_in_progress:%u", fileid_, num);
    // Only finalize the file when no writes are still in progress; otherwise
    // leave it without an Offset Array so start-up recovery will rebuild it.
    if (file_resource_manager.GetNumWritesInProgress(fileid_) == 0) {
      uint64_t size_offarray;
      file_resource_manager.SetFileSize(fileid_, offset_end_);
      if (ftruncate(fd_, offset_end_) < 0) {
        return Status::IOError("HSTableManager::FlushOffsetArray()", strerror(errno));
      }
      Status s = WriteOffsetArray(fd_, file_resource_manager.GetOffsetArray(fileid_), &size_offarray, filetype_default_, file_resource_manager.HasPaddingInValues(fileid_), false);
      uint64_t filesize = file_resource_manager.GetFileSize(fileid_);
      file_resource_manager.SetFileSize(fileid_, filesize + size_offarray);
      return s;
    }
    return Status::OK();
  }

  // Appends the Offset Array rows followed by the HSTable footer at the end
  // of 'fd'. '*size_out' receives the number of bytes appended. The footer's
  // trailing CRC32 covers everything appended here except the CRC32 itself.
  Status WriteOffsetArray(int fd, const std::vector< std::pair >& offarray_current, uint64_t* size_out, FileType filetype, bool has_padding_in_values, bool has_invalid_entries) {
    uint64_t offset = 0;
    struct OffsetArrayRow row;
    for (auto& p: offarray_current) {
      row.hashed_key = p.first;
      row.offset_entry = p.second;
      uint32_t length = OffsetArrayRow::EncodeTo(&row, buffer_index_ + offset);
      offset += length;
      log::trace("HSTableManager::WriteOffsetArray()", "hashed_key:[0x%" PRIx64 "] offset:[0x%08x]", p.first, p.second);
    }

    int64_t position = lseek(fd, 0, SEEK_END);
    if (position < 0) {
      return Status::IOError("HSTableManager::WriteOffsetArray()", strerror(errno));
    }
    log::trace("HSTableManager::WriteOffsetArray()", "file position:[%" PRIu64 "]", position);

    struct HSTableFooter footer;
    footer.filetype = filetype;
    footer.offset_indexes = position;
    footer.num_entries = offarray_current.size();
    footer.magic_number = get_magic_number();
    if (has_padding_in_values) footer.SetFlagHasPaddingInValues();
    if (has_invalid_entries) footer.SetFlagHasInvalidEntries();
    uint32_t length = HSTableFooter::EncodeTo(&footer, buffer_index_ + offset);
    offset += length;
    // Fill in the footer's CRC32 slot (the last 4 bytes of what is written).
    uint32_t crc32 = crc32c::Value(buffer_index_, offset - 4);
    EncodeFixed32(buffer_index_ + offset - 4, crc32);

    if (write(fd, buffer_index_, offset) < 0) {
      log::trace("HSTableManager::WriteOffsetArray()", "Error write(): %s", strerror(errno));
    }

    // ftruncate() is necessary in case the file system space for the file was pre-allocated
    if (ftruncate(fd, position + offset) < 0) {
      return Status::IOError("HSTableManager::WriteOffsetArray()", strerror(errno));
    }

    *size_out = offset;
    log::trace("HSTableManager::WriteOffsetArray()", "offset_indexes:%u, num_entries:[%lu]", position, offarray_current.size());
    return Status::OK();
  }

  uint64_t WriteFirstPartLargeOrder(Order& order, uint64_t hashed_key) {
    // If a large order is self-contained, it will still be split into parts,
    // and therefore the operations on the first and last parts will be done
    // as expected. See notes in WriteOrdersAndFlushFile() for more information.

    // TODO-30: large files should be pre-allocated. The problem here is that
    // the streaming interface needs to work over a network, thus the
    // pre-allocation can't block or take too long.
uint64_t fileid_largefile = IncrementSequenceFileId(1);
    uint64_t timestamp_largefile = IncrementSequenceTimestamp(1);
    std::string filepath = GetFilepath(fileid_largefile);
    log::trace("HSTableManager::WriteFirstPartLargeOrder()", "filepath:[%s] key:[%s] tid:[0x%08" PRIx64 "]", filepath.c_str(), order.key.ToString().c_str(), order.tid);
    int fd = 0;
    if ((fd = open(filepath.c_str(), O_WRONLY|O_CREAT, 0644)) < 0) {
      log::emerg("HSTableManager::WriteFirstPartLargeOrder()", "Could not open file [%s]: %s", filepath.c_str(), strerror(errno));
      return 0;
    }

    // Write hstable header
    char buffer[db_options_.internal__hstable_header_size];
    struct HSTableHeader hstheader;
    hstheader.filetype = kCompactedLargeType;
    hstheader.timestamp = timestamp_largefile;
    HSTableHeader::EncodeTo(&hstheader, &db_options_, buffer);
    if (write(fd, buffer, db_options_.internal__hstable_header_size) < 0) {
      log::emerg("HSTableManager::WriteFirstPartLargeOrder()", "Error write(): %s", strerror(errno));
      return 0;
    }

    // Write entry header -- checksums are zero for now; the header is written
    // again with the final values when the last part arrives (see
    // WriteMiddleOrLastPart()).
    struct EntryHeader entry_header;
    entry_header.SetTypePut();
    entry_header.SetEntryFull();
    entry_header.size_key = order.key.size();
    entry_header.size_value = order.size_value;
    entry_header.size_value_compressed = order.size_value_compressed;
    entry_header.size_padding = 0;
    entry_header.hash = hashed_key;
    entry_header.checksum_header = 0;
    entry_header.checksum_content = 0;
    entry_header.SetIsUncompacted(false);
    entry_header.SetHasPadding(false);
    uint32_t size_header = EntryHeader::EncodeTo(db_options_, &entry_header, buffer);
    // Remember the header size so the rewrite of the header at the last part
    // lands at the right offset.
    key_to_headersize[order.tid][order.key.ToString()] = size_header;
    if (write(fd, buffer, size_header) < 0) {
      log::emerg("HSTableManager::WriteFirstPartLargeOrder()", "Error write(): %s", strerror(errno));
      return 0;
    }

    // Write key and part
    // NOTE: Could also put the key and part in the buffer and do a single write
    if (write(fd, order.key.data(), order.key.size()) < 0) {
      log::emerg("HSTableManager::WriteFirstPartLargeOrder()", "Error write(): %s", strerror(errno));
      return 0;
    }
    if (write(fd, order.chunk.data(), order.chunk.size()) < 0) {
      log::emerg("HSTableManager::WriteFirstPartLargeOrder()", "Error write(): %s", strerror(errno));
      return 0;
    }

    // Pre-size the file for the entire (uncompressed) value.
    uint64_t filesize = db_options_.internal__hstable_header_size + size_header + order.key.size() + order.size_value;
    if (ftruncate(fd, filesize) < 0) {
      log::emerg("HSTableManager::WriteFirstPartLargeOrder()", "Error ftruncate(): %s", strerror(errno));
      return 0;
    }

    if (order.write_options.sync && FileUtil::sync_file(fd) < 0) {
      log::emerg("HSTableManager::WriteFirstPartLargeOrder()", "Error sync_file(): %s", strerror(errno));
    }

    file_resource_manager.SetFileSize(fileid_largefile, filesize);
    close(fd);
    // Location is (fileid << 32) | offset-of-entry-within-file.
    uint64_t fileid_shifted = fileid_largefile;
    fileid_shifted <<= 32;
    uint64_t location = fileid_shifted | db_options_.internal__hstable_header_size;
    log::trace("HSTableManager::WriteFirstPartLargeOrder()", "fileid [%d] location: [%" PRIu64 "]", fileid_largefile, location);
    file_resource_manager.SetNumWritesInProgress(fileid_largefile, 1);
    file_resource_manager.AddOffsetArray(fileid_largefile, std::pair(hashed_key, db_options_.internal__hstable_header_size));
    return location;
  }

  // Writes a subsequent part of a multi-part entry at the position encoded in
  // 'location'; on the last part, rewrites the entry header with the final
  // sizes and checksums, and finalizes the file if it has become inactive.
  uint64_t WriteMiddleOrLastPart(Order& order, uint64_t hashed_key, uint64_t location) {
    uint32_t fileid = (location & 0xFFFFFFFF00000000) >> 32;
    uint32_t offset_file = location & 0x00000000FFFFFFFF;
    std::string filepath = GetFilepath(fileid);
    if (fileid != fileid_ && file_resource_manager.GetNumWritesInProgress(fileid) == 0) {
      // This file is not the latest file, and it has no writes in progress.
// The file was either closed or the writes timed out, therefore do nothing
      return 0;
    }

    log::trace("HSTableManager::WriteMiddleOrLastPart()", "key [%s] filepath:[%s] offset_chunk:%" PRIu64, order.key.ToString().c_str(), filepath.c_str(), order.offset_chunk);
    int fd = 0;
    if ((fd = open(filepath.c_str(), O_WRONLY, 0644)) < 0) {
      log::emerg("HSTableManager::WriteMiddleOrLastPart()", "Could not open file [%s]: %s", filepath.c_str(), strerror(errno));
      return 0;
    }

    if (key_to_headersize.find(order.tid) == key_to_headersize.end() || key_to_headersize[order.tid].find(order.key.ToString()) == key_to_headersize[order.tid].end()) {
      log::trace("HSTableManager::WriteMiddleOrLastPart()", "Missing in key_to_headersize[]");
    }
    uint32_t size_header = key_to_headersize[order.tid][order.key.ToString()];

    // Write the part at its absolute position within the entry's value area.
    if (pwrite(fd, order.chunk.data(), order.chunk.size(), offset_file + size_header + order.key.size() + order.offset_chunk) < 0) {
      log::trace("HSTableManager::WriteMiddleOrLastPart()", "Error pwrite(): %s", strerror(errno));
    }

    // If this is a last part, the header is written again to save
    // the right size of compressed value along with the checksums.
    if (order.IsLastPart()) {
      log::trace("HSTableManager::WriteMiddleOrLastPart()", "Write compressed size: [%s] - size:%" PRIu64 ", compressed size:%" PRIu64 " crc32:0x%08" PRIx64, order.key.ToString().c_str(), order.size_value, order.size_value_compressed, order.crc32);
      struct EntryHeader entry_header;
      entry_header.SetTypePut();
      entry_header.SetEntryFull();
      entry_header.size_key = order.key.size();
      entry_header.size_value = order.size_value;
      entry_header.size_value_compressed = order.size_value_compressed;
      entry_header.size_padding = order.IsLarge() ? 0 : EntryHeader::CalculatePaddingSize(order.size_value);
      entry_header.hash = hashed_key;
      entry_header.checksum_content = order.crc32;

      if (!order.IsLarge() && entry_header.IsCompressed()) {
        // NOTE: entry_header.IsCompressed() makes no sense since compression is
        // handled at database level, not at entry level. All usages of
        // IsCompressed() should be replaced by a check on the database options.
        entry_header.SetIsUncompacted(true);
        file_resource_manager.SetHasPaddingInValues(fileid_, true);
        entry_header.SetHasPadding(true);
      }
      //entry_header.print();
      char buffer[sizeof(struct EntryHeader)*2];
      uint32_t size_header_new = EntryHeader::EncodeTo(db_options_, &entry_header, buffer);
      //log::trace("HSTableManager::WriteMiddleOrLastPart()", "CRC32 header: 0x%02" PRIx8, checksum_header);
      //entry_header.print();

      // The rewritten header must be byte-for-byte the same size as the one
      // written by the first part, otherwise the entry layout is corrupted.
      if (size_header_new != size_header) {
        log::emerg("HSTableManager::WriteMiddleOrLastPart()", "Error of encoding: the initial header had a size of %u, and it is now %u. The entry is now corrupted.", size_header, size_header_new);
        return 0;
      }

      if (pwrite(fd, buffer, size_header, offset_file) < 0) {
        log::emerg("HSTableManager::WriteMiddleOrLastPart()", "Error pwrite(): %s", strerror(errno));
        return 0;
      }

      if (order.IsLarge() && entry_header.IsCompressed()) {
        // Shrink the large file down to the compressed size.
        uint64_t filesize = db_options_.internal__hstable_header_size + size_header + order.key.size() + order.size_value_compressed;
        file_resource_manager.SetFileSize(fileid, filesize);
        if (ftruncate(fd, filesize) < 0) {
          log::emerg("HSTableManager::WriteMiddleOrLastPart()", "Error ftruncate(): %s", strerror(errno));
          return 0;
        }
      }

      uint32_t num_writes_in_progress = file_resource_manager.SetNumWritesInProgress(fileid, -1);
      if (fileid != fileid_ && num_writes_in_progress == 0) {
        // The file is no longer the current one and this was its last pending
        // write: finalize it by appending its Offset Array.
        // TODO: factorize this code with FlushOffsetArray()
        log::trace("HSTableManager::WriteMiddleOrLastPart()", "About to write Offset Array");
        uint64_t size_offarray;
        FileType filetype = order.IsLarge() ? kCompactedLargeType : filetype_default_;
        uint64_t filesize_before = file_resource_manager.GetFileSize(fileid);
        if (ftruncate(fd, filesize_before) < 0) {
          log::emerg("HSTableManager::WriteMiddleOrLastPart()", "Error ftruncate(): %s", strerror(errno));
          return 0;
        }
        Status s = WriteOffsetArray(fd, file_resource_manager.GetOffsetArray(fileid), &size_offarray, filetype, file_resource_manager.HasPaddingInValues(fileid), false);
        if (!s.IsOK()) {
          log::emerg("HSTableManager::WriteMiddleOrLastPart()", "Error on WriteOffsetArray(): %s", s.ToString().c_str());
          return 0;
        }
        uint64_t filesize = file_resource_manager.GetFileSize(fileid);
        file_resource_manager.SetFileSize(fileid, filesize + size_offarray);
        if (order.IsLarge()) file_resource_manager.SetFileLarge(fileid);
        file_resource_manager.ClearTemporaryDataForFileId(fileid);
      }
    }

    if (order.write_options.sync && FileUtil::sync_file(fd) < 0) {
      log::emerg("HSTableManager::WriteMiddleOrLastPart()", "Error sync_file(): %s", strerror(errno));
    }

    close(fd);
    log::trace("HSTableManager::WriteMiddleOrLastPart()", "all good");
    return location;
  }

  // Buffers the first part of a multi-part entry, or an entire self-contained
  // entry, into the current file; returns the encoded location
  // ((fileid << 32) | offset), or 0 on error.
  uint64_t WriteFirstPartOrSmallOrder(Order& order, uint64_t hashed_key) {
    if (order.write_options.sync) {
      has_sync_option_ = true;
    }
    uint64_t location_out = 0;
    struct EntryHeader entry_header;
    if (order.type == OrderType::Put) {
      entry_header.SetTypePut();
      entry_header.SetEntryFull();
      entry_header.size_key = order.key.size();
      entry_header.size_value = order.size_value;
      entry_header.size_value_compressed = order.size_value_compressed;
      entry_header.hash = hashed_key;
      entry_header.checksum_content = order.crc32;
      if (order.IsSelfContained()) {
        entry_header.SetIsUncompacted(false);
        entry_header.SetHasPadding(false);
        entry_header.size_padding = 0;
      } else {
        // More parts will follow: reserve padding so the compressed value can
        // be written in place later.
        entry_header.SetIsUncompacted(true);
        file_resource_manager.SetHasPaddingInValues(fileid_, true);
        entry_header.SetHasPadding(true);
        entry_header.size_padding = EntryHeader::CalculatePaddingSize(order.size_value);
        // TODO: check that the has_padding_in_values
field in fields is used during compaction } uint32_t size_header = EntryHeader::EncodeTo(db_options_, &entry_header, buffer_raw_ + offset_end_); /* if (order.IsSelfContained()) { size_header = EntryHeader::EncodeTo(db_options_, &entry_header, buffer_raw_ + offset_end_); log::trace("HSTableManager::WriteFirstPartOrSmallOrder()", "IsSelfContained():true - crc32 [0x%08x]", entry_header.crc32); } */ memcpy(buffer_raw_ + offset_end_ + size_header, order.key.data(), order.key.size()); memcpy(buffer_raw_ + offset_end_ + size_header + order.key.size(), order.chunk.data(), order.chunk.size()); //map_index[order.key] = fileid_ | offset_end_; uint64_t fileid_shifted = fileid_; fileid_shifted <<= 32; location_out = fileid_shifted | offset_end_; file_resource_manager.AddOffsetArray(fileid_, std::pair(hashed_key, offset_end_)); offset_end_ += size_header + order.key.size() + order.chunk.size(); if (!order.IsSelfContained()) { key_to_headersize[order.tid][order.key.ToString()] = size_header; log::trace("HSTableManager::WriteFirstPartOrSmallOrder()", "BEFORE fileid_ %u", fileid_); file_resource_manager.SetNumWritesInProgress(fileid_, 1); FlushCurrentFile(0, entry_header.size_value_offset() - order.chunk.size()); // NOTE: A better way to do it would be to copy things into the buffer, and // then for the other parts, either copy in the buffer if the position // to write is >= offset_end_, or do a pwrite() if the position is < // offset_end_ // NOTE: might be better to lseek() instead of doing a large write // NOTE: No longer necessary to do the lseek() here, as I'm doing it in // the FlushCurrentFile() //offset_end_ += order.size_value - order.size_chunk; //FlushCurrentFile(); //ftruncate(fd_, offset_end_); //lseek(fd_, 0, SEEK_END); log::trace("HSTableManager::WriteFirstPartOrSmallOrder()", "AFTER fileid_ %u", fileid_); } log::trace("HSTableManager::WriteFirstPartOrSmallOrder()", "Put [%s]", order.key.ToString().c_str()); } else { // order.type == OrderType::Delete 
log::trace("HSTableManager::WriteFirstPartOrSmallOrder()", "Delete [%s]", order.key.ToString().c_str()); entry_header.SetTypeDelete(); entry_header.SetEntryFull(); entry_header.size_key = order.key.size(); entry_header.size_value = 0; entry_header.size_value_compressed = 0; entry_header.checksum_content = order.crc32; uint32_t size_header = EntryHeader::EncodeTo(db_options_, &entry_header, buffer_raw_ + offset_end_); memcpy(buffer_raw_ + offset_end_ + size_header, order.key.data(), order.key.size()); uint64_t fileid_shifted = fileid_; fileid_shifted <<= 32; location_out = fileid_shifted | offset_end_; file_resource_manager.AddOffsetArray(fileid_, std::pair(hashed_key, offset_end_)); offset_end_ += size_header + order.key.size(); } return location_out; } void WriteOrdersAndFlushFile(std::vector& orders, std::multimap& map_index_out) { for (auto& order: orders) { if (offset_end_ > size_block_) { log::trace("HSTableManager::WriteOrdersAndFlushFile()", "About to flush - offset_end_: %" PRIu64 " | size_key: %d | size_value: %d | size_block_: %" PRIu64, offset_end_, order.key.size(), order.size_value, size_block_); FlushCurrentFile(true, 0); } if (!has_file_) OpenNewFile(); uint64_t hashed_key = hash_->HashFunction(order.key.data(), order.key.size()); // TODO-13: if the item is self-contained (unique part), then no need to // have size_value space, size_value_compressed is enough. // TODO-12: If the db is embedded, then all order are self contained, // independently of their sizes. Would the compression and CRC32 still // work? Would storing the data (i.e. choosing between the different // storing functions) still work? 
// There are three categories of entries: // - Small entries: sizes within [0, server.recv_socket_buffer_size) // - Medium entries: sizes within [server.recv_socket_buffer_size, hstable.maximum_size) // - Large entries: sizes greater than hstable.maximum_size // // When using the storage engine through a network interface, medium and // large entries are split into parts of size at most server.recv_socket_buffer_size, // making them "multipart" entries. // Small entries do not need to be split, and are therefore "self-contained". // Parts are held into "orders", which hold extra metadata needed // for various steps of the storage process. // There are three types of parts, based on their positions in the data // stream: first part, middle part, and last part. Different operations // need to be completed on an order depending on the type of part it // contains. // // When using the storage engine embedded in another program, orders can be // on any size, and because it is embedded, the data can be sent as is to // the storage engine, potentially in a very large buffer, larger than // the size of server.recv_socket_buffer_size contrained when on a network. // Because the logic in the storage engine expects first and last parts, // a large order that is at the same time a first *and* a last part could // cause an issue: the order could be treated only as a first part, // and the operations triggered by the arrival of the last part // may not be done. To solve that problem, and because compression // and hash functions take input of limited sizes anyway, the constant // 'maximum_part_size' has been introduced. As part of the // Database::PutPart() method, the sizes of incoming orders are checked, // and if they are larger than 'maximum_part_size', they are split // into smaller parts. 
This is done in such a way that any // self-contained large entry would be split, therefore guaranteeing // that that the operations done by both the first and last parts // are triggered. // // For performance reasons, the small and medium entries incoming during // the same time period are grouped together in a buffer and written // at once to a "regular" HSTable. Large entries are written to their own // HSTable, referred to as "large" HSTable. // 1. The order is the first part of a large entry, so we create a // large HSTable and write the first part in there uint64_t location = 0; if (order.IsLarge() && order.IsFirstPart()) { // TODO-11: shouldn't this be testing size_value_compressed as well? -- yes, only if the order // is a full entry by itself (will happen when the kvstore will be embedded and not accessed // through the network), otherwise we don't know yet what the total compressed size will be. location = WriteFirstPartLargeOrder(order, hashed_key); // 2. The order is a middle or last part, so we open the HSTable, // pwrite() the part, and close the HSTable } else if (order.IsMiddleOrLastPart()) { // TODO-11: replace the tests on compression "order.size_value_compressed ..." by a real test on a flag or a boolean // TODO-11: replace the use of size_value or size_value_compressed by a unique size() which would already return the right value if (key_to_location.find(order.tid) == key_to_location.end()) { location = 0; } else { location = key_to_location[order.tid][order.key.ToString()]; } if (location != 0) { WriteMiddleOrLastPart(order, hashed_key, location); } else { log::emerg("HSTableManager", "Avoided catastrophic location error (in case 2) key:[%s] tid:[0x%08" PRIx64 "]", order.key.ToString().c_str(), order.tid); for (auto& p: key_to_location[order.tid]) { log::emerg("HSTableManager", "key:%s value:%" PRIu64, p.first.c_str(), p.second); } } // 3. 
The order is a self-contained small part, or a first part // for a medium entry, thus it is added to the current buffer and // is written to the latest on-going HSTable } else { buffer_has_items_ = true; location = WriteFirstPartOrSmallOrder(order, hashed_key); } // Traces int caseid = 0; if (order.IsLarge() && order.IsFirstPart()) { caseid = 1; } else if (order.IsMiddleOrLastPart()) { caseid = 2; } else { caseid = 3; } log::trace("HSTableManager::WriteOrdersAndFlushFile()", "%d. key: [%s] size_chunk:%" PRIu64 " offset_chunk: %" PRIu64, caseid, order.key.ToString().c_str(), order.chunk.size(), order.offset_chunk); // If the order is self-contained or a last part, // add his location to the output map_index_out[] if (order.IsSelfContained() || order.IsLastPart()) { log::trace("HSTableManager::WriteOrdersAndFlushFile()", "END OF ORDER key: [%s] size_chunk:%" PRIu64 " offset_chunk: %" PRIu64 " location:%" PRIu64, order.key.ToString().c_str(), order.chunk.size(), order.offset_chunk, location); if (location != 0) { map_index_out.insert(std::pair(hashed_key, location)); } else { log::emerg("HSTableManager", "Avoided catastrophic location error (post-processing last part)"); } if (key_to_location.find(order.tid) != key_to_location.end()) { key_to_location[order.tid].erase(order.key.ToString()); } if (key_to_headersize.find(order.tid) != key_to_headersize.end()) { key_to_headersize[order.tid].erase(order.key.ToString()); } // Else, if the order is not self-contained and is a first part, // the location is saved in key_to_location[] } else if (order.IsFirstPart()) { if (location != 0 && order.type != OrderType::Delete) { key_to_location[order.tid][order.key.ToString()] = location; log::trace("HSTableManager", "location saved: [%" PRIu64 "]", location); } else { log::trace("HSTableManager", "Avoided catastrophic location error (post-processing first part)"); } } } log::trace("HSTableManager::WriteOrdersAndFlushFile()", "end flush"); FlushCurrentFile(0, 0); } static Status 
LoadDatabaseOptionsFromHSTables(std::string& dbname, DatabaseOptions* db_options_out, std::string& prefix_compaction) { // Careful here, code duplication: all of the directory walking and // file selection was taken from LoadDatabase() log::trace("HSTableManager::LoadDatabaseOptionsFromHSTables()", "Start"); char filepath[FileUtil::maximum_path_size()]; DIR *directory; struct dirent *entry; struct stat info; if ((directory = opendir(dbname.c_str())) == NULL) { return Status::IOError("Could not open database directory", dbname.c_str()); } bool found_valid_db_options = false; while ((entry = readdir(directory)) != NULL) { if (strcmp(entry->d_name, DatabaseOptions::GetFilename().c_str()) == 0) continue; if (strcmp(entry->d_name, prefix_compaction.c_str()) == 0) continue; int ret = snprintf(filepath, FileUtil::maximum_path_size(), "%s/%s", dbname.c_str(), entry->d_name); if (ret < 0 || ret >= FileUtil::maximum_path_size()) { log::emerg("HSTableManager::LoadDatabaseOptionsFromHSTables()", "Filepath buffer is too small, could not build the filepath string for file [%s]", entry->d_name); continue; } if (stat(filepath, &info) != 0 || !(info.st_mode & S_IFREG)) continue; // Yes, using the default internal__hstable_header_size value from the // object this method is meant to return. 
if (info.st_size <= (off_t)db_options_out->internal__hstable_header_size) { log::trace("HSTableManager::LoadDatabaseOptionsFromHSTables()", "file: [%s] only has a header or less, skipping\n", entry->d_name); continue; } Mmap mmap(filepath, info.st_size); if (!mmap.is_valid()) return Status::IOError("Mmap constructor failed"); struct HSTableHeader hstheader; struct DatabaseOptions db_options; Status s = HSTableHeader::DecodeFrom(mmap.datafile(), mmap.filesize(), &hstheader, &db_options); if (s.IsOK()) { *db_options_out = db_options; found_valid_db_options = true; break; } else { log::trace("HSTableManager::LoadDatabaseOptionsFromHSTables()", "file: [%s] has an invalid header, skipping\n", entry->d_name); } } if (found_valid_db_options) { return Status::OK(); } else { return Status::IOError("Could not find any HSTable with a valid database option backup."); } } Status LoadDatabase(std::string& dbname, std::multimap& index_se, std::set* fileids_ignore=nullptr, uint32_t fileid_end=0, std::vector* fileids_iterator=nullptr) { Status s; struct stat info; if (!is_read_only_) { if ( stat(dirpath_locks_.c_str(), &info) != 0 && mkdir(dirpath_locks_.c_str(), 0755) < 0) { return Status::IOError("Could not create lock directory", strerror(errno)); } /* if(!(info.st_mode & S_IFDIR)) { return Status::IOError("A file with same name as the lock directory already exists and is not a directory. 
Delete or rename this file to continue.", dirpath_locks_.c_str()); } */ s = FileUtil::remove_files_with_prefix(dbname_.c_str(), prefix_compaction_); if (!s.IsOK()) return Status::IOError("Could not clean up previous compaction"); s = DeleteAllLockedFiles(dbname_); if (!s.IsOK()) return Status::IOError("Could not clean up snapshots"); s = FileUtil::remove_files_with_prefix(dirpath_locks_.c_str(), ""); if (!s.IsOK()) return Status::IOError("Could not clean up locks"); } DIR *directory; struct dirent *entry; if ((directory = opendir(dbname.c_str())) == NULL) { return Status::IOError("Could not open database directory", dbname.c_str()); } // Sort the fileids by , so that puts and removes can be // applied in the right order. // Indeed, imagine that we have files with ids from 1 to 100, and a // compaction process operating on files 1 through 50. The files 1-50 are // going to be compacted and the result of this compaction written // to ids 101 and above, which means that even though the entries in // files 101 and above are older than the entries in files 51-100, they are // in files with greater ids. Thus, the file ids cannot be used as a safe // way to order the entries in a set of files, and we need to have a sequence id // which will allow all other processes to know what is the order of // the entries in a set of files, which is why we have a 'timestamp' in each // file. As a consequence, the sequence id is the concatenation of // the 'timestamp' and the 'fileid'. // As the compaction process will always include at least one uncompacted // file, the maximum timestamp is garanteed to be always increasing and no // overlapping will occur. 
std::map timestamp_fileid_to_fileid; char filepath[FileUtil::maximum_path_size()]; char buffer_key[64]; // buffer used to order HSTables when loading a database, // shouldn't need more than 33 bytes, but rounded up uint32_t fileid_max = 0; uint64_t timestamp_max = 0; uint32_t fileid = 0; while ((entry = readdir(directory)) != NULL) { if (strcmp(entry->d_name, DatabaseOptions::GetFilename().c_str()) == 0) continue; if (strcmp(entry->d_name, prefix_compaction_.c_str()) == 0) continue; int ret = snprintf(filepath, FileUtil::maximum_path_size(), "%s/%s", dbname.c_str(), entry->d_name); if (ret < 0 || ret >= FileUtil::maximum_path_size()) { log::emerg("HSTableManager::LoadDatabase()", "Filepath buffer is too small, could not build the filepath string for file [%s]", entry->d_name); continue; } if (stat(filepath, &info) != 0 || !(info.st_mode & S_IFREG)) continue; fileid = HSTableManager::hex_to_num(entry->d_name); if ( fileids_ignore != nullptr && fileids_ignore->find(fileid) != fileids_ignore->end()) { log::trace("HSTableManager::LoadDatabase()", "Skipping file in fileids_ignore:: [%s] [%lld] [%u]\n", entry->d_name, info.st_size, fileid); continue; } if (fileid_end != 0 && fileid > fileid_end) { log::trace("HSTableManager::LoadDatabase()", "Skipping file with id larger than fileid_end (%u): [%s] [%lld] [%u]\n", fileid, entry->d_name, info.st_size, fileid); continue; } log::trace("HSTableManager::LoadDatabase()", "file: [%s] [%lld] [%u]\n", entry->d_name, info.st_size, fileid); if (info.st_size <= (off_t)db_options_.internal__hstable_header_size) { log::trace("HSTableManager::LoadDatabase()", "file: [%s] only has a header or less, skipping\n", entry->d_name); continue; } Mmap mmap(filepath, info.st_size); if (!mmap.is_valid()) return Status::IOError("Mmap constructor failed"); struct HSTableHeader hstheader; Status s = HSTableHeader::DecodeFrom(mmap.datafile(), mmap.filesize(), &hstheader); if (!s.IsOK()) { log::trace("HSTableManager::LoadDatabase()", "file: [%s] has an 
invalid header, skipping\n", entry->d_name); continue; } sprintf(buffer_key, "%016" PRIx64 "-%016x", hstheader.timestamp, fileid); std::string key(buffer_key); timestamp_fileid_to_fileid[key] = fileid; fileid_max = std::max(fileid_max, fileid); timestamp_max = std::max(timestamp_max, hstheader.timestamp); } for (auto& p: timestamp_fileid_to_fileid) { uint32_t fileid = p.second; if (fileids_iterator != nullptr) fileids_iterator->push_back(fileid); std::string filepath = GetFilepath(fileid); log::trace("HSTableManager::LoadDatabase()", "Loading file:[%s] with key:[%s]", filepath.c_str(), p.first.c_str()); if (stat(filepath.c_str(), &info) != 0) continue; Mmap mmap(filepath.c_str(), info.st_size); if (!mmap.is_valid()) return Status::IOError("Mmap constructor failed"); uint64_t filesize; bool is_file_large, is_file_compacted; s = LoadFile(mmap, fileid, index_se, &filesize, &is_file_large, &is_file_compacted); if (s.IsOK()) { file_resource_manager.SetFileSize(fileid, filesize); if (is_file_large) file_resource_manager.SetFileLarge(fileid); if (is_file_compacted) file_resource_manager.SetFileCompacted(fileid); } else if (!s.IsOK() && !is_read_only_) { log::warn("HSTableManager::LoadDatabase()", "Could not load index in file [%s], entering recovery mode", filepath.c_str()); s = RecoverFile(mmap, fileid, index_se); if (!s.IsOK()) { log::warn("HSTableManager::LoadDatabase()", "Recovery failed for file [%s]", filepath.c_str()); mmap.Close(); if (std::remove(filepath.c_str()) != 0) { log::emerg("HSTableManager::LoadDatabase()", "Could not remove file [%s]", filepath.c_str()); } } } } if (fileid_max > 0) { SetSequenceFileId(fileid_max); SetSequenceTimestamp(timestamp_max); } closedir(directory); return Status::OK(); } static Status LoadFile(Mmap& mmap, uint32_t fileid, std::multimap& index_se, uint64_t *filesize_out=nullptr, bool *is_file_large_out=nullptr, bool *is_file_compacted_out=nullptr) { log::trace("LoadFile()", "Loading [%s] of size:%u, sizeof(HSTableFooter):%u", 
mmap.filepath(), mmap.filesize(), HSTableFooter::GetFixedSize()); struct HSTableFooter footer; Status s; s = HSTableFooter::DecodeFrom(mmap.datafile() + mmap.filesize() - HSTableFooter::GetFixedSize(), HSTableFooter::GetFixedSize(), &footer); if (!s.IsOK()) return s; if (footer.magic_number != HSTableManager::get_magic_number()) { log::trace("LoadFile()", "Skipping [%s] - magic_number:[%" PRIu64 "/%" PRIu64 "]", mmap.filepath(), footer.magic_number, get_magic_number()); return Status::IOError("Invalid footer"); } uint32_t crc32_computed = crc32c::Value(mmap.datafile() + footer.offset_indexes, mmap.filesize() - footer.offset_indexes - 4); if (crc32_computed != footer.crc32) { log::trace("LoadFile()", "Skipping [%s] - Invalid CRC32:[%08x/%08x]", mmap.filepath(), footer.crc32, crc32_computed); return Status::IOError("Invalid footer"); } log::trace("LoadFile()", "Footer OK"); // The file has a clean footer, load all the offsets in the index uint64_t offset_index = footer.offset_indexes; struct OffsetArrayRow row; for (uint64_t i = 0; i < footer.num_entries; i++) { uint32_t length_row = 0; s = OffsetArrayRow::DecodeFrom(mmap.datafile() + offset_index, mmap.filesize() - offset_index, &row, &length_row); if (!s.IsOK()) return s; uint64_t fileid_shifted = fileid; fileid_shifted <<= 32; index_se.insert(std::pair(row.hashed_key, fileid_shifted | row.offset_entry)); log::trace("LoadFile()", "Add item to index -- hashed_key:[0x%" PRIx64 "] offset:[%u] -- offset_index:[%" PRIu64 "]", row.hashed_key, row.offset_entry, offset_index); offset_index += length_row; } if (filesize_out) *filesize_out = mmap.filesize(); if (is_file_large_out) *is_file_large_out = footer.IsTypeLarge() ? true : false; if (is_file_compacted_out) *is_file_compacted_out = footer.IsTypeCompacted() ? 
true : false; log::trace("LoadFile()", "Loaded [%s] num_entries:[%" PRIu64 "]", mmap.filepath(), footer.num_entries); return Status::OK(); } Status RecoverFile(Mmap& mmap, uint32_t fileid, std::multimap& index_se) { uint32_t offset = db_options_.internal__hstable_header_size; std::vector< std::pair > offarray_current; bool has_padding_in_values = false; bool has_invalid_entries = false; struct HSTableHeader hstheader; Status s = HSTableHeader::DecodeFrom(mmap.datafile(), mmap.filesize(), &hstheader); // 1. If the file is a large file, just discard it if (!s.IsOK() || hstheader.IsTypeLarge()) { return Status::IOError("Could not recover file"); } // 2. If the file is a hstable, go over all its entries and verify each one of them while (true) { struct EntryHeader entry_header; uint32_t size_header; ReadOptions read_options; read_options.verify_checksums = true; Status s = EntryHeader::DecodeFrom(db_options_, read_options, mmap.datafile() + offset, mmap.filesize() - offset, &entry_header, &size_header); if ( !s.IsOK() || !entry_header.AreSizesValid(offset, mmap.filesize())) { // End of file during recovery, thus breaking out of the while-loop break; } // NOTE: The checksum is verified only for uncompacted files, because this // is when an entry can be invalid due to transfer or write issues. // For compacted files and during the compaction process, it does not // matter whether or not the entry is valid. The user will know that // an entry is invalid after doing a Get(), and that is his choice to do a // Delete() if he wants to delete the entry. Keep in mind though that if // the checksum is wrong, it's possible for the hashedkey to be // erroneous, in which case the only way to find and remove invalid // entries is to iterate over whole database, and do Delete() commands // for the entries with invalid checksums. bool do_checksum_verification = entry_header.IsUncompacted() ? 
true : false; bool is_crc32_valid = true; if (do_checksum_verification) { crc32_.ResetThreadLocalStorage(); crc32_.stream(mmap.datafile() + offset + 5, size_header + entry_header.size_key + entry_header.size_value_used() - 5); is_crc32_valid = (entry_header.checksum_content == crc32_.get()); } if (!do_checksum_verification || is_crc32_valid) { // Valid content, add to index offarray_current.push_back(std::pair(entry_header.hash, offset)); uint64_t fileid_shifted = fileid; fileid_shifted <<= 32; index_se.insert(std::pair(entry_header.hash, fileid_shifted | offset)); } else { has_invalid_entries = true; } if (entry_header.HasPadding()) has_padding_in_values = true; offset += size_header + entry_header.size_key + entry_header.size_value_offset(); log::trace("HSTableManager::RecoverFile", "Scanned hash [%" PRIu64 "], next offset [%" PRIu64 "] - CRC32:%s stored=0x%08x computed=0x%08x", entry_header.hash, offset, do_checksum_verification ? (is_crc32_valid?"OK":"ERROR") : "UNKNOWN", entry_header.checksum_content, crc32_.get()); } // 3. 
Write a new index at the end of the file with whatever entries could be save if (offset > db_options_.internal__hstable_header_size) { mmap.Close(); int fd; if ((fd = open(mmap.filepath(), O_WRONLY, 0644)) < 0) { log::emerg("HSTableManager::RecoverFile()", "Could not open file [%s]: %s", mmap.filepath(), strerror(errno)); return Status::IOError("Could not open file for recovery", mmap.filepath()); } if (ftruncate(fd, offset) < 0) { return Status::IOError("HSTableManager::RecoverFile()", strerror(errno)); } uint64_t size_offarray; Status s = WriteOffsetArray(fd, offarray_current, &size_offarray, hstheader.GetFileType(), has_padding_in_values, has_invalid_entries); if (!s.IsOK()) return s; file_resource_manager.SetFileSize(fileid, mmap.filesize() + size_offarray); close(fd); } else { return Status::IOError("Could not recover file"); } return Status::OK(); } Status DeleteAllLockedFiles(std::string& dbname) { std::set fileids; DIR *directory; struct dirent *entry; if ((directory = opendir(dirpath_locks_.c_str())) == NULL) { return Status::IOError("Could not open lock directory", dirpath_locks_.c_str()); } uint32_t fileid = 0; while ((entry = readdir(directory)) != NULL) { if (strncmp(entry->d_name, ".", 1) == 0) continue; fileid = HSTableManager::hex_to_num(entry->d_name); fileids.insert(fileid); } closedir(directory); for (auto& fileid: fileids) { if (std::remove(GetFilepath(fileid).c_str()) != 0) { log::emerg("DeleteAllLockedFiles()", "Could not remove data file [%s]", GetFilepath(fileid).c_str()); } } return Status::OK(); } uint64_t static get_magic_number() { return 0x4d454f57; } private: // Options DatabaseOptions db_options_; Hash *hash_; bool is_read_only_; bool is_closed_; FileType filetype_default_; std::mutex mutex_close_; uint32_t fileid_; uint32_t sequence_fileid_; std::mutex mutex_sequence_fileid_; uint64_t timestamp_; uint64_t sequence_timestamp_; std::mutex mutex_sequence_timestamp_; bool is_locked_sequence_timestamp_; uint64_t size_block_; bool 
has_file_; bool has_sync_option_; int fd_; std::string filepath_; uint64_t offset_start_; uint64_t offset_end_; std::string dbname_; char *buffer_raw_; char *buffer_index_; bool buffer_has_items_; kdb::CRC32 crc32_; std::string prefix_; std::string prefix_compaction_; std::string dirpath_locks_; bool wait_until_can_open_new_files_; public: FileResourceManager file_resource_manager; // key_to_location is made to be dependent on the id of the thread that // originated an order, so that if two writers simultaneously write entries // with the same key, they will be properly stored into separate locations. // NOTE: is it possible for a part to arrive when the file is not yet // created, and have it's WriteMiddleOrLastPart() fail because of that? // If so, need to write in buffer_raw_ instead // TODO-37: if a thread crashes or terminates, its data will *not* be cleaned up. std::map< std::thread::id, std::map > key_to_location; std::map< std::thread::id, std::map > key_to_headersize; }; } // namespace kdb #endif // KINGDB_HSTABLE_MANAGER_H_ ================================================ FILE: storage/resource_manager.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. 
#ifndef KINGDB_RESOURCE_MANAGER_H_ #define KINGDB_RESOURCE_MANAGER_H_ #include "util/debug.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "util/byte_array.h" #include "util/file.h" #include "util/options.h" #include "algorithm/crc32c.h" #include "algorithm/hash.h" namespace kdb { class FileResourceManager { public: FileResourceManager() { Reset(); } void Reset() { std::unique_lock lock(mutex_); dbsize_total_ = 0; dbsize_uncompacted_ = 0; filesizes_.clear(); largefiles_.clear(); compactedfiles_.clear(); num_writes_in_progress_.clear(); offarrays_.clear(); has_padding_in_values_.clear(); epoch_last_activity_.clear(); } void ClearTemporaryDataForFileId(uint32_t fileid) { std::unique_lock lock(mutex_); num_writes_in_progress_.erase(fileid); offarrays_.erase(fileid); has_padding_in_values_.erase(fileid); epoch_last_activity_.erase(fileid); } void ClearAllDataForFileId(uint32_t fileid) { ClearTemporaryDataForFileId(fileid); std::unique_lock lock(mutex_); uint64_t filesize = 0; if (filesizes_.find(fileid) != filesizes_.end()) { filesize = filesizes_[fileid]; } IncrementDbSizeTotal(-filesize); if (compactedfiles_.find(fileid) == compactedfiles_.end()) { IncrementDbSizeUncompacted(-filesize); } filesizes_.erase(fileid); largefiles_.erase(fileid); compactedfiles_.erase(fileid); } uint64_t GetFileSize(uint32_t fileid) { std::unique_lock lock(mutex_); return filesizes_[fileid]; } void SetFileSize(uint32_t fileid, uint64_t filesize) { std::unique_lock lock(mutex_); uint64_t filesize_before = 0; if (filesizes_.find(fileid) != filesizes_.end()) { filesize_before = filesizes_[fileid]; } IncrementDbSizeTotal(filesize - filesize_before); if (compactedfiles_.find(fileid) == compactedfiles_.end()) { IncrementDbSizeUncompacted(filesize - filesize_before); } filesizes_[fileid] = filesize; } bool IsFileLarge(uint32_t fileid) { std::unique_lock lock(mutex_); return 
(largefiles_.find(fileid) != largefiles_.end()); } void SetFileLarge(uint32_t fileid) { mutex_.lock(); largefiles_.insert(fileid); mutex_.unlock(); SetFileCompacted(fileid); } bool IsFileCompacted(uint32_t fileid) { std::unique_lock lock(mutex_); return (compactedfiles_.find(fileid) != compactedfiles_.end()); } void SetFileCompacted(uint32_t fileid) { // NOTE: the compacted files are all the ones before the fileid at which the // compaction process is currently waiting. Thus technically, there is no // need for a std::set to know which HSTables are compacted and which // aren't. This could be optimized at some point. std::unique_lock lock(mutex_); if (compactedfiles_.find(fileid) != compactedfiles_.end()) return; compactedfiles_.insert(fileid); if (filesizes_.find(fileid) != filesizes_.end()) { // The size for this file was already set, thus the size of uncompacted // files needs to be updated. IncrementDbSizeUncompacted(-filesizes_[fileid]); } } uint32_t GetNumWritesInProgress(uint32_t fileid) { std::unique_lock lock(mutex_); return num_writes_in_progress_[fileid]; } uint32_t SetNumWritesInProgress(uint32_t fileid, int inc) { // The number of writers to a specific file is being tracked so that if a // file is flushed but is still being written to due to some multipart // entry, we don't write the footer yet. That way, if any crash happens, // the file will have no footer, which will force a recovery and discover // which entries have corrupted data. 
std::unique_lock lock(mutex_); if (num_writes_in_progress_.find(fileid) == num_writes_in_progress_.end()) { num_writes_in_progress_[fileid] = 0; } num_writes_in_progress_[fileid] += inc; epoch_last_activity_[fileid] = GetEpochNow(); return num_writes_in_progress_[fileid]; } uint64_t GetEpochNow() { // Returns epoch in milliseconds struct timeval tv; gettimeofday(&tv, NULL); uint64_t epoch = (uint64_t)(tv.tv_sec) * 1000 + (uint64_t)(tv.tv_usec) / 1000; return epoch; } uint64_t GetEpochLastActivity(uint32_t fileid) { std::unique_lock lock(mutex_); return epoch_last_activity_[fileid]; } const std::vector< std::pair > GetOffsetArray(uint32_t fileid) { return offarrays_[fileid]; } void AddOffsetArray(uint32_t fileid, std::pair p) { offarrays_[fileid].push_back(p); } bool HasPaddingInValues(uint32_t fileid) { std::unique_lock lock(mutex_); return (has_padding_in_values_.find(fileid) != has_padding_in_values_.end()); } void SetHasPaddingInValues(uint32_t fileid, bool flag) { std::unique_lock lock(mutex_); if (flag) { has_padding_in_values_.insert(fileid); } else { has_padding_in_values_.erase(fileid); } } uint64_t GetDbSizeTotal() { std::unique_lock lock(mutex_dbsize_); return dbsize_total_; } uint64_t GetDbSizeUncompacted() { std::unique_lock lock(mutex_dbsize_); return dbsize_uncompacted_; } void IncrementDbSizeTotal(int64_t inc) { std::unique_lock lock(mutex_dbsize_); assert(dbsize_total_ + inc >= 0); dbsize_total_ += inc; } void IncrementDbSizeUncompacted(int64_t inc) { std::unique_lock lock(mutex_dbsize_); assert(dbsize_uncompacted_ + inc >= 0); dbsize_uncompacted_ += inc; } private: // NOTE: all files go through the same mutexes -- this can easily be sharded std::mutex mutex_; std::mutex mutex_dbsize_; std::map filesizes_; std::set largefiles_; std::set compactedfiles_; std::map num_writes_in_progress_; std::map > > offarrays_; std::set has_padding_in_values_; std::map epoch_last_activity_; uint64_t dbsize_total_; uint64_t dbsize_uncompacted_; }; } // namespace kdb 
#endif // KINGDB_RESOURCE_MANAGER_H_ ================================================ FILE: storage/storage_engine.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. #ifndef KINGDB_STORAGE_ENGINE_H_ #define KINGDB_STORAGE_ENGINE_H_ #include "util/debug.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "util/version.h" #include "util/options.h" #include "util/order.h" #include "util/filepool.h" #include "util/byte_array.h" #include "util/file.h" #include "algorithm/crc32c.h" #include "algorithm/hash.h" #include "storage/format.h" #include "storage/resource_manager.h" #include "storage/hstable_manager.h" #include "thread/event_manager.h" namespace kdb { class StorageEngine { friend class RegularIterator; friend class SequentialIterator; public: StorageEngine(DatabaseOptions db_options, EventManager *event_manager, std::string dbname, bool read_only=false, // TODO: this should be part of db_options -- sure about that? what options are stored on disk? 
std::set* fileids_ignore=nullptr, uint32_t fileid_end=0) : db_options_(db_options), event_manager_(event_manager), is_read_only_(read_only), prefix_compaction_("compaction_"), dirpath_locks_(dbname + "/locks"), hstable_manager_(db_options, dbname, "", prefix_compaction_, dirpath_locks_, kUncompactedRegularType, read_only), hstable_manager_compaction_(db_options, dbname, prefix_compaction_, prefix_compaction_, dirpath_locks_, kCompactedRegularType, read_only) { log::trace("StorageEngine:StorageEngine()", "dbname: %s", dbname.c_str()); dbname_ = dbname; fileids_ignore_ = fileids_ignore; num_readers_ = 0; is_compaction_in_progress_ = false; force_compaction_ = false; sequence_snapshot_ = 0; stop_requested_ = false; is_closed_ = false; fs_free_space_ = db_options_.storage__minimum_free_space_accept_orders; file_manager_ = std::make_shared(); if (!is_read_only_) { thread_index_ = std::thread(&StorageEngine::ProcessingLoopIndex, this); thread_data_ = std::thread(&StorageEngine::ProcessingLoopData, this); thread_compaction_ = std::thread(&StorageEngine::ProcessingLoopCompaction, this); thread_statistics_ = std::thread(&StorageEngine::ProcessingLoopStatistics, this); } hash_ = MakeHash(db_options.hash); if (!is_read_only_) { fileids_iterator_ = nullptr; } else { fileids_iterator_ = new std::vector(); } Status s = hstable_manager_.LoadDatabase(dbname, index_, fileids_ignore_, fileid_end, fileids_iterator_); if (!s.IsOK()) { log::emerg("StorageEngine", "Could not load database: [%s]", s.ToString().c_str()); Close(); } } ~StorageEngine() {} static std::string GetCompactionFilePrefix() { return "compaction_"; } void Close() { std::unique_lock lock(mutex_close_); if (is_closed_) return; is_closed_ = true; // Wait for readers to exit AcquireWriteLock(); hstable_manager_.Close(); Stop(); ReleaseWriteLock(); if (!is_read_only_) { log::trace("StorageEngine::Close()", "join start"); event_manager_->update_index.NotifyWait(); // notifies ProcessingLoopIndex() 
event_manager_->flush_buffer.NotifyWait(); // notifies ProcessingLoopData() cv_statistics_.notify_all(); // notifies ProcessingLoopStatistics() cv_loop_compaction_.notify_all(); // notifies ProcessingLoopCompaction() thread_index_.join(); thread_data_.join(); thread_compaction_.join(); thread_statistics_.join(); Status s = ReleaseAllSnapshots(); if (!s.IsOK()) { log::emerg("StorageEngine::Close()", s.ToString().c_str()); } log::trace("StorageEngine::Close()", "join end"); } if (fileids_ignore_ != nullptr) { delete fileids_ignore_; } if (fileids_iterator_ != nullptr) { delete fileids_iterator_; } delete hash_; log::trace("StorageEngine::Close()", "done"); } bool IsStopRequested() { return stop_requested_; } void Stop() { stop_requested_ = true; } void ProcessingLoopStatistics() { std::chrono::milliseconds duration(db_options_.storage__statistics_polling_interval); while (true) { std::unique_lock lock(mutex_statistics_); fs_free_space_ = FileUtil::fs_free_space(dbname_.c_str()); cv_statistics_.wait_for(lock, duration); if (IsStopRequested()) return; } } uint64_t GetFreeSpace() { std::unique_lock lock(mutex_statistics_); return fs_free_space_; } Status FileSystemStatus() { if (GetFreeSpace() < db_options_.storage__minimum_free_space_accept_orders) { return Status::IOError("Not enough free space on the file system"); } else if (!hstable_manager_.CanOpenNewFiles()) { return Status::IOError("Cannot open new files"); } return Status::OK(); } void ProcessingLoopCompaction() { // 1. Have a ProcessingLoopStatistics() which pull the disk usage and // dbsize values every 'db.compaction.check_interval' milliseconds. // 2. 'fileid_lastcompacted' hold the id of the last hstable that was // successfully compacted. // 3. 
If the free disk space is > db.compaction.filesystem.survival_mode_threshold // mode = normal // batch_size = db.compaction.filesystem.normal_batch_size // else // mode = survival // batch_size = db.compaction.filesystem.survival_batch_size // // If free disk space > db.compaction.filesystem.free_space_required // && // sum all uncompacted files > batch_size // Do compaction by going to step 4 // else: // Sleep for a duration of db.compaction.check_interval // Go to step 3 // 4. Initialize: M = M_default // 5. In compaction: // - Scan through uncompacted files until the sum of their sizes reaches M // - Try reserving the disk space necessary for the compaction process // - If compaction fails, do M <- M/2 and goto step 5 // - If M reaches 0, try one compaction run (trying to clear the large // files if any), and if still unsuccessful, declare compaction impossible // 6. If the compaction succeeded, update 'fileid_lastcompacted' std::chrono::milliseconds duration(db_options_.internal__compaction_check_interval); uint32_t fileid_lastcompacted = 0; uint32_t fileid_out = 0; int num_loops_without_compaction = 0; int num_loops_without_compaction_max = db_options_.compaction__force_interval / db_options_.internal__compaction_check_interval; while (true) { num_loops_without_compaction += 1; uint64_t size_compaction = 0; uint64_t fs_free_space = GetFreeSpace(); if (fs_free_space > db_options_.compaction__filesystem__survival_mode_threshold) { size_compaction = db_options_.compaction__filesystem__normal_batch_size; } else { size_compaction = db_options_.compaction__filesystem__survival_batch_size; } // Only files that are no longer taking incoming updates can be compacted uint32_t fileid_end = hstable_manager_.GetHighestStableFileId(fileid_lastcompacted + 1); uint64_t dbsize_uncompacted = hstable_manager_.file_resource_manager.GetDbSizeUncompacted(); log::trace("ProcessingLoopCompaction", "fileid_end:%u fs_free_space:%" PRIu64 " compaction.filesystem.free_space_required:%" 
PRIu64 " size_compaction:%" PRIu64 " dbsize_uncompacted:%" PRIu64, fileid_end, fs_free_space, db_options_.compaction__filesystem__free_space_required, size_compaction, dbsize_uncompacted); if ( fileid_end > 0 && fs_free_space > db_options_.compaction__filesystem__free_space_required && ( dbsize_uncompacted > size_compaction || force_compaction_ || (num_loops_without_compaction_max > 0 && num_loops_without_compaction >= num_loops_without_compaction_max) ) ) { num_loops_without_compaction = 0; while (true) { fileid_out = 0; Status s = Compaction(dbname_, fileid_lastcompacted + 1, fileid_end, size_compaction, &fileid_out); if (!s.IsOK()) { if (size_compaction == 0) break; size_compaction /= 2; } else { if (fileid_out > 0) fileid_lastcompacted = fileid_out; if (force_compaction_) { int has_compacted_all_files = fileid_out == 0 ? 1 : 0; log::trace("ProcessingLoopCompaction", "Compaction() OK - has_compacted_all_files: %d - fileid_last:%u fileid_end:%u\n", has_compacted_all_files, fileid_lastcompacted, fileid_end); if (!has_compacted_all_files) continue; } break; } if (IsStopRequested()) return; } } if (force_compaction_) { // Whether the forced compaction worked or not, success is sent to the // requesting method so it can be unblocked. int has_compacted_all_files = 1; event_manager_->compaction_status.StartAndBlockUntilDone(has_compacted_all_files); } std::unique_lock lock(mutex_loop_compaction_); cv_loop_compaction_.wait_for(lock, duration); if (IsStopRequested()) return; } } void ProcessingLoopData() { // NOTE: Write failures are handled as such: // - If the entry is small, there is no failure handling: the entry either // is already full, or it didn't make it that far in the data pipe. // - If the entry is a multipart entry or a large entry, it is possible that // a part doesn't make it to the Storage Engine for any reason: network // issue, storage failure, etc. 
In that case, the Offset Array for the // HSTable of the entry will not be written by the Storage Engine, and // the entry will not be saved in the index either. // - The compaction thread goes over all recent HSTables regularly to find // the highest stable file id, by calling GetHighestStableFileId(). This // method also cleans up all HSTables that have timed out: // * The temporary data kept for them in the FileResourceManager // to avoid memory leaks. // * If the file is a regular HSTable, the OffsetArray is written, without // the erroneous entries in it. That way, during the compaction process, // the invalid entry won't be found in the Offset Array and will simply // be ignored, effectively reclaiming the stoarge space it used. // * If the file is a large HSTable, it is simply deleted: there is no // point keeping a large timed out HSTable with invalid data anyway. // - If for any reason the database crashes, and the OffsetArray for an // HSTable has not been written to disk yet: // * If the file is uncompacted, the checksums of all entries are verified, // and entries with invalid checksums are not added to the recovered // OffsetArray, which will enable the subsequent compaction process // to reclaim the storage space for that invalid entry. // * If the file is compacted, the checksums for that HSTable are not // verified: it is the responsibility of the user to decide, when doing // a Get(), whether or not he wants to verify the checksum, and if it // is invalid, whether or not to Delete() the entry. // TODO-41: Not all cases of Offset Array failures are handled with the current // implementation: fix that. 
while(true) { // Wait for orders to process log::trace("StorageEngine::ProcessingLoopData()", "start"); std::vector orders = event_manager_->flush_buffer.Wait(); if (IsStopRequested()) return; log::trace("StorageEngine::ProcessingLoopData()", "got %d orders", orders.size()); // Process orders, and create update map for the index AcquireWriteLock(); std::multimap map_index; hstable_manager_.WriteOrdersAndFlushFile(orders, map_index); ReleaseWriteLock(); event_manager_->flush_buffer.Done(); event_manager_->update_index.StartAndBlockUntilDone(map_index); } } void ProcessingLoopIndex() { while(true) { log::trace("StorageEngine::ProcessingLoopIndex()", "start"); std::multimap index_updates = event_manager_->update_index.Wait(); if (IsStopRequested()) return; log::trace("StorageEngine::ProcessingLoopIndex()", "got index_updates: %d updates", index_updates.size()); /* for (auto& p: index_updates) { if (p.second == 0) { log::trace("StorageEngine::ProcessingLoopIndex()", "remove [%s] num_items_index [%d]", p.first.c_str(), index_.size()); index_.erase(p.first); } else { log::trace("StorageEngine::ProcessingLoopIndex()", "put [%s]", p.first.c_str()); index_[p.first] = p.second; } } */ std::multimap *index; mutex_compaction_.lock(); if (is_compaction_in_progress_) { index = &index_compaction_; } else { index = &index_; } mutex_compaction_.unlock(); int num_iterations_per_lock = db_options_.internal__num_iterations_per_lock; int counter_iterations = 0; for (auto& p: index_updates) { if (counter_iterations == 0) { AcquireWriteLock(); } counter_iterations += 1; //uint64_t hashed_key = hash_->HashFunction(p.first.c_str(), p.first.size()); //log::trace("StorageEngine::ProcessingLoopIndex()", "hash [%" PRIu64 "] location [%" PRIu64 "]", p.first, p.second); //mutex_index_.lock(); index->insert(std::pair(p.first, p.second)); //mutex_index_.unlock(); // Throttling the index updates, and allows other processes // to acquire the write lock if they need it if (counter_iterations >= 
num_iterations_per_lock) { ReleaseWriteLock(); counter_iterations = 0; } } if (counter_iterations) ReleaseWriteLock(); /* for (auto& p: index_) { log::trace("index_", "hash:[0x%08x] location:[%" PRIu64 "]", p.first, p.second); } */ event_manager_->update_index.Done(); log::trace("StorageEngine::ProcessingLoopIndex()", "done"); int temp = 1; event_manager_->clear_buffer.StartAndBlockUntilDone(temp); } } Status Get(ReadOptions& read_options, ByteArray& key, ByteArray* value_out, uint64_t *location_out=nullptr) { // NOTE: There is no monitoring or reference counting for the file used by // a ByteArray. If the user has a ByteArray pointing to a file, and // this file is deleted by the compaction process, this could lead // to serious issues. // Luckily, in Linux and BSD, the kernel counts open file descriptors // for each file, and files are really deleted only when all the // descriptors are closed. Therefore, it is fine to delete files that // are open: their content will remain available to the file // descriptor holder, and the storage space will be reclaimed when the // file descriptor is closed. 
mutex_write_.lock(); mutex_read_.lock(); num_readers_ += 1; mutex_read_.unlock(); mutex_write_.unlock(); bool has_compaction_index = false; mutex_compaction_.lock(); has_compaction_index = is_compaction_in_progress_; mutex_compaction_.unlock(); Status s; if (!has_compaction_index) { s = GetWithIndex(read_options, index_, key, value_out, location_out); } else { s = GetWithIndex(read_options, index_compaction_, key, value_out, location_out); if (!s.IsOK() && !s.IsDeleteOrder()) { s = GetWithIndex(read_options, index_, key, value_out, location_out); } } mutex_read_.lock(); num_readers_ -= 1; log::trace("Get()", "num_readers_: %d", num_readers_); mutex_read_.unlock(); cv_read_.notify_one(); return s; } Status GetWithIndex(ReadOptions& read_options, std::multimap& index, ByteArray& key, ByteArray* value_out, uint64_t *location_out=nullptr) { //std::unique_lock lock(mutex_index_); // TODO-26: should not be locking here, instead, should store the hashed key // and location from the index and release the lock right away -- should not // be locking while calling GetEntry() // NOTE: Since C++11, the relative ordering of elements with equivalent keys // in a multimap is preserved. 
uint64_t hashed_key = hash_->HashFunction(key.data(), key.size()); //log::trace("StorageEngine::GetWithIndex()", "num entries in index:[%d] content:[%s] size:[%d] hashed_key:[0x%" PRIx64 "]", index.size(), key.ToString().c_str(), key.size(), hashed_key); auto range = index.equal_range(hashed_key); if (range.first != range.second) { auto it = --range.second; do { ByteArray key_temp; Status s = GetEntry(read_options, it->second, &key_temp, value_out); //log::trace("StorageEngine::GetWithIndex()", "key:[%s] key_temp:[%s] hashed_key:[0x%" PRIx64 "] hashed_key_temp:[0x%" PRIx64 "] size_key:[%" PRIu64 "] size_key_temp:[%" PRIu64 "]", key.ToString().c_str(), key_temp.ToString().c_str(), hashed_key, it->first, key.size(), key_temp.size()); if ((s.IsOK() || s.IsDeleteOrder()) && key_temp == key) { //log::trace("StorageEngine::GetWithIndex()", "Entry [%s] found at location: 0x%08" PRIx64, key.ToString().c_str(), it->second); if (location_out != nullptr) *location_out = it->second; return s; } --it; } while(it != range.first); } //log::trace("StorageEngine::GetWithIndex()", "%s - not found!", key.ToString().c_str()); return Status::NotFound("Unable to find the entry in the storage engine"); } Status GetEntry(ReadOptions& read_options, uint64_t location, ByteArray* key_out, ByteArray* value_out) { log::trace("StorageEngine::GetEntry()", "start"); Status s = Status::OK(); // TODO: check that the offset falls into the // size of the file, just in case a file was truncated but the index // still had a pointer to an entry in at an invalid location -- // alternatively, we could just let the host program crash, to force a restart // which would rebuild the index properly uint32_t fileid = (location & 0xFFFFFFFF00000000) >> 32; uint32_t offset_file = location & 0x00000000FFFFFFFF; uint64_t filesize = 0; filesize = hstable_manager_.file_resource_manager.GetFileSize(fileid); //log::trace("StorageEngine::GetEntry()", "location:%" PRIu64 " fileid:%u offset_file:%u filesize:%" PRIu64, 
location, fileid, offset_file, filesize); std::string filepath = hstable_manager_.GetFilepath(fileid); // TODO: optimize here //ByteArray key_temp = NewMmappedByteArray(filepath, filesize); ByteArray key_temp = ByteArray::NewPooledByteArray(file_manager_, fileid, filepath, filesize); ByteArray value_temp = key_temp; // NOTE: verify that value_temp.size() is indeed filesize -- verified and // the size was 0: should the size of an mmapped byte array be the size of // the file by default? struct EntryHeader entry_header; uint32_t size_header; s = EntryHeader::DecodeFrom(db_options_, read_options, value_temp.data() + offset_file, filesize - offset_file, &entry_header, &size_header); if (!s.IsOK()) return s; if ( !entry_header.AreSizesValid(offset_file, filesize) || !entry_header.IsEntryFull()) { entry_header.print(); return Status::IOError("Entry has invalid header"); } if (read_options.verify_checksums) { uint32_t checksum_key = crc32c::Value(value_temp.data() + offset_file + size_header, entry_header.size_key); value_temp.set_checksum_initial(checksum_key); } key_temp.set_offset(offset_file + size_header); key_temp.set_size(entry_header.size_key); value_temp.set_offset(offset_file + size_header + entry_header.size_key); value_temp.set_size(entry_header.size_value); value_temp.set_size_compressed(entry_header.size_value_compressed); value_temp.set_checksum(entry_header.checksum_content); //PrintHex(value_temp.data(), 16); if (entry_header.IsTypeDelete()) { s = Status::DeleteOrder(); } log::debug("StorageEngine::GetEntry()", "mmap() out - type remove:%d", entry_header.IsTypeDelete()); log::trace("StorageEngine::GetEntry()", "Sizes: key_temp:%" PRIu64 " value_temp:%" PRIu64 " size_value_compressed:%" PRIu64 " filesize:%" PRIu64, key_temp.size(), value_temp.size(), value_temp.size_compressed(), filesize); *key_out = key_temp; *value_out = value_temp; return s; } bool IsLocationLastInIndex(uint64_t location, ByteArray& key) { // Only ever called by a Snapshot, thus no 
    // need to lock anything with mutexes.
    uint64_t hashed_key = hash_->HashFunction(key.data(), key.size());
    bool is_last = false;
    // NOTE(review): assumes index_ is non-empty and holds at least one entry
    // with a key <= hashed_key: if upper_bound() returned begin(), the
    // decrement below would be undefined. Acceptable only because the caller
    // obtained 'location' from this index -- TODO confirm.
    auto it = index_.upper_bound(hashed_key);
    --it;
    if (it->second == location) is_last = true;
    return is_last;
  }

  // Thin delegate to the file resource manager's large-file registry.
  bool IsFileLarge(uint32_t fileid) {
    return hstable_manager_.file_resource_manager.IsFileLarge(fileid);
  }

  // Compacts the HSTables in [fileid_start, fileid_end_target] up to roughly
  // 'size_compaction' bytes of input; on success *fileid_out receives the id
  // of the last file that was compacted (0 if nothing was compacted).
  Status Compaction(std::string dbname,
                    uint32_t fileid_start,
                    uint32_t fileid_end_target,
                    uint64_t size_compaction,
                    uint32_t *fileid_out) {
    // NOTE: Depending on what has to be compacted, Compaction() can take a
    //       long time. Therefore IsStopRequested() is called at the end
    //       of each major step to allow the method to exit in case a stop
    //       was requested.

    // TODO: make sure that all sets, maps and multimaps are cleared whenever
    //       they are no longer needed

    // TODO: when compaction starts, open() a file and lseek() to reserve disk
    //       space -- or write a bunch of files with the "compaction_" prefix
    //       that will be overwritten when the compacted files are written.

    // TODO: add a new flag in files that says "compacted" or "regular", and before
    //       starting any compaction process, select only regular files, ignore
    //       compacted ones. (large files are 'compacted' by default).

    // TODO-23: replace the change on is_compaction_in_progress_ by a RAII
    //          WARNING: this is not the only part of the code with this issue,
    //          some code digging in all files is required
    mutex_compaction_.lock();
    is_compaction_in_progress_ = true;
    mutex_compaction_.unlock();

    // TODO: If is_compaction_in_progress_ is set to true and then the method
    //       return due to an error, then if it is established that the
    //       compaction process really failed, index_compaction_ needs to be
    //       poured back into index_, and is_compaction_in_progress_ needs to be
    //       set to false.
// Before the compaction starts, make sure all compaction-related files are removed Status s; s = FileUtil::remove_files_with_prefix(dbname.c_str(), prefix_compaction_); if (!s.IsOK()) return Status::IOError("Could not clean up previous compaction", dbname.c_str()); // 1a. Get *all* the files that are candidates for compaction // TODO: This is a quick hack to get the files for compaction, by going // through all the files. Fix that to be only the latest non-handled // uncompacted files log::trace("Compaction()", "Step 1: Get files between fileids %u and %u", fileid_start, fileid_end_target); std::multimap index_compaction; DIR *directory; struct dirent *entry; if ((directory = opendir(dbname.c_str())) == NULL) { return Status::IOError("Could not open database directory", dbname.c_str()); } if (IsStopRequested()) return Status::IOError("Stop was requested"); std::map fileids_to_filesizes; char filepath[FileUtil::maximum_path_size()]; uint32_t fileid = 0; struct stat info; while ((entry = readdir(directory)) != NULL) { if (strcmp(entry->d_name, DatabaseOptions::GetFilename().c_str()) == 0) continue; int ret = snprintf(filepath, FileUtil::maximum_path_size(), "%s/%s", dbname.c_str(), entry->d_name); if (ret < 0 || ret >= FileUtil::maximum_path_size()) { log::emerg("Compaction()", "Filepath buffer is too small, could not build the filepath string for file [%s]", entry->d_name); continue; } fileid = HSTableManager::hex_to_num(entry->d_name); if ( hstable_manager_.file_resource_manager.IsFileCompacted(fileid) || stat(filepath, &info) != 0 || !(info.st_mode & S_IFREG) || fileid < fileid_start || fileid > fileid_end_target || info.st_size <= (off_t)db_options_.internal__hstable_header_size) { continue; } fileids_to_filesizes[fileid] = info.st_size; } closedir(directory); // 1b. 
Filter to process files only up to a certain total size // (large files are ignored) uint32_t fileid_end_actual = 0; uint64_t size_total = 0; for (auto& p: fileids_to_filesizes) { // NOTE: Here the locations are read directly from the secondary storage, // which could be optimized by reading them from the index in memory. // One way to do that is to have a temporary index to which all // updates are synced during compaction. That way, the main index is // guaranteed to not be changed, thus all sorts of scans and changes // can be done on it. Once compaction is over, the temporary index // can just be poured into the main index. uint32_t fileid = p.first; uint64_t filesize = p.second; if (!IsFileLarge(fileid) && size_total + filesize > size_compaction) break; fileid_end_actual = fileid; *fileid_out = fileid; std::string filepath = hstable_manager_.GetFilepath(fileid); Mmap mmap(filepath, filesize); if (!mmap.is_valid()) return Status::IOError("Mmap constructor failed"); s = hstable_manager_.LoadFile(mmap, fileid, index_compaction); if (!s.IsOK()) { log::warn("HSTableManager::Compaction()", "Could not load index in file [%s]", filepath.c_str()); // TODO: handle the case where a file is found to be damaged during compaction } size_total += filesize; } fileids_to_filesizes.clear(); // no longer needed if (IsStopRequested()) return Status::IOError("Stop was requested"); // 2. Iterating over all unique hashed keys of index_compaction, and determine which // locations of the storage engine index 'index_' with similar hashes will need to be compacted. 
log::trace("Compaction()", "Step 2: Get unique hashed keys"); std::vector> index_compaction_se; for (auto it = index_compaction.begin(); it != index_compaction.end(); it = index_compaction.upper_bound(it->first)) { auto range = index_.equal_range(it->first); for (auto it_se = range.first; it_se != range.second; ++it_se) { index_compaction_se.push_back(*it_se); } } index_compaction.clear(); // no longer needed if (IsStopRequested()) return Status::IOError("Stop was requested"); // 3. For each entry, determine which location has to be kept, which has to be deleted, // and the overall set of file ids that needs to be compacted log::trace("Compaction()", "Step 3: Determine locations"); std::set locations_delete; std::set fileids_compaction; std::set fileids_largefiles_keep; std::set keys_encountered; std::multimap hashedkeys_to_locations_regular_keep; std::multimap hashedkeys_to_locations_large_keep; // Reversing the order of the vector to guarantee that // the most recent locations are treated first std::reverse(index_compaction_se.begin(), index_compaction_se.end()); ReadOptions read_options; for (auto &p: index_compaction_se) { ByteArray key, value; uint64_t& location = p.second; uint32_t fileid = (location & 0xFFFFFFFF00000000) >> 32; if (fileid > fileid_end_actual) { // Make sure that files added after the compacted // files or during the compaction itself are not used continue; } fileids_compaction.insert(fileid); Status s = GetEntry(read_options, location, &key, &value); std::string str_key = key.ToString(); // For any given key, only the first occurrence, which is the most recent one, // has to be kept. The other ones will be deleted. If the first occurrence // is a Delete Order, then all occurrences of that key will be deleted. 
if (keys_encountered.find(str_key) == keys_encountered.end()) { keys_encountered.insert(str_key); if (IsFileLarge(fileid)) { hashedkeys_to_locations_large_keep.insert(p); fileids_largefiles_keep.insert(fileid); } else if (!s.IsDeleteOrder()) { hashedkeys_to_locations_regular_keep.insert(p); } else { locations_delete.insert(location); } } else { locations_delete.insert(location); } } index_compaction_se.clear(); // no longer needed keys_encountered.clear(); // no longer needed if (IsStopRequested()) return Status::IOError("Stop was requested"); // 4. Building the clusters of locations, indexed by the smallest location // per cluster. All the non-smallest locations are stored as secondary // locations. Only regular entries are used: it would not make sense // to compact large entries anyway. log::trace("Compaction()", "Step 4: Building clusters"); std::map> hashedkeys_clusters; std::set locations_secondary; for (auto it = hashedkeys_to_locations_regular_keep.begin(); it != hashedkeys_to_locations_regular_keep.end(); it = hashedkeys_to_locations_regular_keep.upper_bound(it->first)) { auto range = hashedkeys_to_locations_regular_keep.equal_range(it->first); std::vector locations; for (auto it_bucket = range.first; it_bucket != range.second; ++it_bucket) { log::trace("Compaction()", "Building clusters - location:%" PRIu64, it->second); locations.push_back(it->second); } std::sort(locations.begin(), locations.end()); hashedkeys_clusters[locations[0]] = locations; for (size_t i = 1; i < locations.size(); i++) { locations_secondary.insert(locations[i]); } } hashedkeys_to_locations_regular_keep.clear(); if (IsStopRequested()) return Status::IOError("Stop was requested"); /* * The compaction needs the following collections: * * - fileids_compaction: fileids of all files on which compaction must operate * set * * - fileids_largefiles_keep: set of fileids that contain large items that must be kept * set * * - hashedkeys_clusters: clusters of locations having same hashed keys, 
* sorted by ascending order of hashed keys and indexed by the smallest * location. * map> * * - locations_secondary: locations of all entries to keep * set * * - locations_delete: locations of all entries to delete * set * */ // 5a. Reserving space in the file system // Reserve as much space are the files to compact are using, this is a // poor approximation, but should cover most cases. Large files are ignored. uint32_t fileid_compaction = 1; for (auto it = fileids_compaction.begin(); it != fileids_compaction.end(); ++it) { uint32_t fileid = *it; if (IsFileLarge(fileid)) continue; uint64_t filesize = hstable_manager_.file_resource_manager.GetFileSize(fileid); std::string filepath = hstable_manager_compaction_.GetFilepath(fileid_compaction); Status s = FileUtil::fallocate_filepath(filepath, filesize); if (!s.IsOK()) { // TODO: the cleanup of the compaction (removals, etc.) should be // mutualized in the processing loop FileUtil::remove_files_with_prefix(dbname.c_str(), prefix_compaction_); return s; } fileid_compaction += 1; } if (IsStopRequested()) return Status::IOError("Stop was requested"); // 5b. Mmapping all the files involved in the compaction log::trace("Compaction()", "Step 5: Mmap() all the files! ALL THE FILES!"); std::map mmaps; for (auto it = fileids_compaction.begin(); it != fileids_compaction.end(); ++it) { uint32_t fileid = *it; if (fileids_largefiles_keep.find(fileid) != fileids_largefiles_keep.end()) continue; struct stat info; std::string filepath = hstable_manager_.GetFilepath(fileid); if (stat(filepath.c_str(), &info) != 0 || !(info.st_mode & S_IFREG)) { log::emerg("Compaction()", "Error during compaction with file [%s]", filepath.c_str()); } Mmap *mmap = new Mmap(filepath.c_str(), info.st_size); if (!mmap->is_valid()) { delete mmap; continue; } mmaps[fileid] = mmap; } if (IsStopRequested()) return Status::IOError("Stop was requested"); // 6. 
Now building a vector of orders, that will be passed to the // hstable_manager_compaction_ object to persist them on disk log::trace("Compaction()", "Step 6: Build order list"); std::vector orders; uint64_t timestamp_max = 0; for (auto it = fileids_compaction.begin(); it != fileids_compaction.end(); ++it) { uint32_t fileid = *it; if (IsFileLarge(fileid)) continue; Mmap* mmap = mmaps[fileid]; // Read the header to update the maximimum timestamp struct HSTableHeader hstheader; s = HSTableHeader::DecodeFrom(mmap->datafile(), mmap->filesize(), &hstheader); if (!s.IsOK()) return Status::IOError("Could not read file header during compaction"); // TODO: skip file instead of returning an error timestamp_max = std::max(timestamp_max, hstheader.timestamp); // Read the footer to get the offset where entries stop struct HSTableFooter footer; Status s = HSTableFooter::DecodeFrom(mmap->datafile() + mmap->filesize() - HSTableFooter::GetFixedSize(), HSTableFooter::GetFixedSize(), &footer); uint32_t crc32_computed = crc32c::Value(mmap->datafile() + footer.offset_indexes, mmap->filesize() - footer.offset_indexes - 4); uint64_t offset_end; if ( !s.IsOK() || footer.magic_number != HSTableManager::get_magic_number() || footer.crc32 != crc32_computed) { // TODO: handle error offset_end = mmap->filesize(); log::trace("Compaction()", "Compaction - invalid footer"); } else { offset_end = footer.offset_indexes; } // Create a set of what's in the OffsetArray of the HSTable being // handled in this iteration std::set offset_array; std::multimap index_hstable; s = hstable_manager_.LoadFile(*mmap, fileid, index_hstable); if (!s.IsOK()) { log::warn("HSTableManager::Compaction()", "Could not load index in file id [%u]", fileid); } for (auto &p: index_hstable) { offset_array.insert(p.second & 0xFFFFFFFF); // insert the offset } index_hstable.clear(); // Process entries in the file uint32_t offset = db_options_.internal__hstable_header_size; while (offset < offset_end) { log::trace("Compaction()", 
"order list loop - offset:%u offset_end:%u", offset, offset_end); struct EntryHeader entry_header; uint32_t size_header; Status s = EntryHeader::DecodeFrom(db_options_, read_options, mmap->datafile() + offset, mmap->filesize() - offset, &entry_header, &size_header); // NOTE: No need to verify the checksum. See notes in ProcessingLoopData() and RecoverFile(). if ( !s.IsOK() || !entry_header.AreSizesValid(offset, mmap->filesize())) { log::trace("Compaction()", "Unexpected end of file - IsOK:%d, offset:%u, size_key:%" PRIu64 ", size_value_offset:%" PRIu64 ", mmap->filesize():%d\n", s.IsOK(), offset, entry_header.size_key, entry_header.size_value_offset(), mmap->filesize()); entry_header.print(); break; } // TODO-19: make function to get location from fileid and offset, and the // fileid and offset from location uint64_t fileid_shifted = fileid; fileid_shifted <<= 32; uint64_t location = fileid_shifted | offset; log::trace("Compaction()", "order list loop - check if we should keep it - fileid:%u offset:%u", fileid, offset); // NOTE: This is where invalid entries are deleted: if an entry is in the // file but not in the offset array, that means that the write of that // entry never finished, thus during the compaction, the entry is simply // ignored, and the storage space it was using will simply be reclaimed. 
if ( offset_array.find(offset) == offset_array.end() || locations_delete.find(location) != locations_delete.end() || locations_secondary.find(location) != locations_secondary.end()) { offset += size_header + entry_header.size_key + entry_header.size_value_offset(); continue; } std::vector locations; if (hashedkeys_clusters.find(location) == hashedkeys_clusters.end()) { log::trace("Compaction()", "order list loop - does not have cluster"); locations.push_back(location); } else { log::trace("Compaction()", "order list loop - has cluster of %d items", hashedkeys_clusters[location].size()); locations = hashedkeys_clusters[location]; } //for (auto it_location = locations.begin(); it_location != locations.end(); ++it_location) { //uint64_t location = *it_location; WriteOptions write_options; for (auto& location: locations) { uint32_t fileid_location = (location & 0xFFFFFFFF00000000) >> 32; uint32_t offset_file = location & 0x00000000FFFFFFFF; log::trace("Compaction()", "order list loop - location fileid:%u offset:%u", fileid_location, offset_file); Mmap *mmap_location = mmaps[fileid_location]; struct EntryHeader entry_header; uint32_t size_header; Status s = EntryHeader::DecodeFrom(db_options_, read_options, mmap->datafile() + offset, mmap->filesize() - offset, &entry_header, &size_header); log::trace("Compaction()", "order list loop - create byte arrays"); ByteArray key = NewPointerByteArray(mmap_location->datafile() + offset_file + size_header, entry_header.size_key); ByteArray chunk = NewPointerByteArray(mmap_location->datafile() + offset_file + size_header + entry_header.size_key, entry_header.size_value_used()); log::trace("Compaction()", "order list loop - push_back() orders"); bool is_large = false; orders.push_back(Order{std::this_thread::get_id(), write_options, OrderType::Put, key, chunk, 0, entry_header.size_value, entry_header.size_value_compressed, entry_header.checksum_content, is_large}); } offset += size_header + entry_header.size_key + 
entry_header.size_value_offset(); } } if (IsStopRequested()) return Status::IOError("Stop was requested"); // 7. Write compacted orders on secondary storage log::trace("Compaction()", "Step 7: Write compacted files"); std::multimap map_index; // All the resulting files will have the same timestamp, which is the // maximum of all the timestamps in the set of files that have been // compacted. This will allow the resulting files to be properly ordered // during the next database startup or recovery process. hstable_manager_compaction_.Reset(); hstable_manager_compaction_.LockSequenceTimestamp(timestamp_max); hstable_manager_compaction_.WriteOrdersAndFlushFile(orders, map_index); hstable_manager_compaction_.CloseCurrentFile(); orders.clear(); mmaps.clear(); if (IsStopRequested()) return Status::IOError("Stop was requested"); // 8. Get fileid range from hstable_manager_ uint32_t num_files_compacted = hstable_manager_compaction_.GetSequenceFileId(); uint32_t offset_fileid = hstable_manager_.IncrementSequenceFileId(num_files_compacted) - num_files_compacted; log::trace("Compaction()", "Step 8: num_files_compacted:%u offset_fileid:%u", num_files_compacted, offset_fileid); if (IsStopRequested()) return Status::IOError("Stop was requested"); // 9. 
Rename files for (uint32_t fileid = 1; fileid <= num_files_compacted; fileid++) { uint32_t fileid_new = fileid + offset_fileid; log::trace("Compaction()", "Renaming [%s] into [%s]", hstable_manager_compaction_.GetFilepath(fileid).c_str(), hstable_manager_.GetFilepath(fileid_new).c_str()); if (std::rename(hstable_manager_compaction_.GetFilepath(fileid).c_str(), hstable_manager_.GetFilepath(fileid_new).c_str()) != 0) { log::emerg("Compaction()", "Could not rename file: %s", strerror(errno)); // TODO: crash here } uint64_t filesize = hstable_manager_compaction_.file_resource_manager.GetFileSize(fileid); hstable_manager_.file_resource_manager.SetFileSize(fileid_new, filesize); hstable_manager_.file_resource_manager.SetFileCompacted(fileid_new); } if (IsStopRequested()) return Status::IOError("Stop was requested"); // 10. Shift returned locations to match renamed files log::trace("Compaction()", "Step 10: Shifting locations"); std::multimap map_index_shifted; for (auto &p: map_index) { const uint64_t& hashedkey = p.first; const uint64_t& location = p.second; uint32_t fileid = (location & 0xFFFFFFFF00000000) >> 32; uint32_t offset_file = location & 0x00000000FFFFFFFF; uint32_t fileid_new = fileid + offset_fileid; uint64_t fileid_shifted = fileid_new; fileid_shifted <<= 32; uint64_t location_new = fileid_shifted | offset_file; log::trace("Compaction()", "Shifting [%" PRIu64 "] into [%" PRIu64 "] (fileid [%u] to [%u])", location, location_new, fileid, fileid_new); map_index_shifted.insert(std::pair(hashedkey, location_new)); } map_index.clear(); if (IsStopRequested()) return Status::IOError("Stop was requested"); // 11. Add the large entries to be kept to the map that will update the 'index_' map_index_shifted.insert(hashedkeys_to_locations_large_keep.begin(), hashedkeys_to_locations_large_keep.end()); if (IsStopRequested()) return Status::IOError("Stop was requested"); // 12. 
Update the storage engine index_, by removing the locations that have // been compacted, and making sure that the locations that have been // added while the compaction was taking place are not removed log::trace("Compaction()", "Step 12: Update the storage engine index_"); int num_iterations_per_lock = db_options_.internal__num_iterations_per_lock; int counter_iterations = 0; for (auto it = map_index_shifted.begin(); it != map_index_shifted.end(); it = map_index_shifted.upper_bound(it->first)) { if (counter_iterations == 0) { AcquireWriteLock(); } counter_iterations += 1; // For each hashed key, get the group of locations from the index_: all the locations // in that group have already been handled during the compaction, except for the ones // that have fileids larger than the max fileid 'fileid_end_actual' -- call these 'locations_after'. const uint64_t& hashedkey = it->first; auto range_index = index_.equal_range(hashedkey); std::vector locations_after; for (auto it_bucket = range_index.first; it_bucket != range_index.second; ++it_bucket) { const uint64_t& location = it_bucket->second; uint32_t fileid = (location & 0xFFFFFFFF00000000) >> 32; if (fileid > fileid_end_actual) { // Save all the locations for files with fileid that were not part of // the compaction process locations_after.push_back(location); } } // Erase the bucket, insert the locations from the compaction process, and // then insert the locations from the files that were not part of the // compaction process, 'locations_after' index_.erase(hashedkey); auto range_compaction = map_index_shifted.equal_range(hashedkey); index_.insert(range_compaction.first, range_compaction.second); for (auto p = locations_after.begin(); p != locations_after.end(); ++p) { index_.insert(std::pair(hashedkey, *p)); } // Throttling the index updates, and allows other processes // to acquire the write lock if they need it if (counter_iterations >= num_iterations_per_lock) { ReleaseWriteLock(); counter_iterations = 0; } } 
if (counter_iterations) ReleaseWriteLock(); if (IsStopRequested()) return Status::IOError("Stop was requested"); // 13. Put all the locations inserted after the compaction started // stored in 'index_compaction_' into the main index 'index_' log::trace("Compaction()", "Step 13: Transfer index_compaction_ into index_"); AcquireWriteLock(); // TODO-38: The pouring of index_compaction_ needs to be throttled just like the // update of index_ above. The problem is that if the lock is acquire // for a limited time only, then another thread could come in and want // to write to the database as well: to which index should it write // then, index_ or index_compaction_? More concerning, if it writes to // index_compaction_, this would mean that it would be writing to // index_compaction_ as an iterator is going over index_compaction_, // which would be just plain wrong. This problem will require more // thinking, for now, just lock for longer and risk to cause timeouts: // better be late than buggy. index_.insert(index_compaction_.begin(), index_compaction_.end()); mutex_compaction_.lock(); is_compaction_in_progress_ = false; mutex_compaction_.unlock(); ReleaseWriteLock(); index_compaction_.clear(); if (IsStopRequested()) return Status::IOError("Stop was requested"); // 14. 
Delete compacted files log::trace("Compaction()", "Step 14: Delete compacted files"); mutex_snapshot_.lock(); if (snapshotids_to_fileids_.size() == 0) { // No snapshots are in progress, remove the files on the spot for (auto& fileid: fileids_compaction) { if (fileids_largefiles_keep.find(fileid) != fileids_largefiles_keep.end()) continue; log::trace("Compaction()", "Removing [%s]", hstable_manager_.GetFilepath(fileid).c_str()); // TODO: free memory associated with the removed file in the file resource manager if (std::remove(hstable_manager_.GetFilepath(fileid).c_str()) != 0) { log::emerg("Compaction()", "Could not remove file [%s]", hstable_manager_.GetFilepath(fileid).c_str()); } hstable_manager_.file_resource_manager.ClearAllDataForFileId(fileid); } } else { // Snapshots are in progress, therefore mark the files and they will be removed when the snapshots are released int num_snapshots = snapshotids_to_fileids_.size(); for (auto& fileid: fileids_compaction) { if (fileids_largefiles_keep.find(fileid) != fileids_largefiles_keep.end()) continue; for (auto& p: snapshotids_to_fileids_) { snapshotids_to_fileids_[p.first].insert(fileid); } if (num_references_to_unused_files_.find(fileid) == num_references_to_unused_files_.end()) { num_references_to_unused_files_[fileid] = 0; } num_references_to_unused_files_[fileid] += num_snapshots; // Create lock file std::string filepath_lock = hstable_manager_.GetLockFilepath(fileid); int fd; if ((fd = open(filepath_lock.c_str(), O_WRONLY|O_CREAT, 0644)) < 0) { log::emerg("StorageEngine::Compaction()", "Could not open file [%s]: %s", filepath_lock.c_str(), strerror(errno)); } close(fd); } } mutex_snapshot_.unlock(); if (IsStopRequested()) return Status::IOError("Stop was requested"); // Cleanup pre-allocated files FileUtil::remove_files_with_prefix(dbname.c_str(), prefix_compaction_); return Status::OK(); } void Compact() { force_compaction_ = true; while (true) { cv_loop_compaction_.notify_all(); int has_compacted_all_files = 
event_manager_->compaction_status.Wait(); event_manager_->compaction_status.Done(); log::trace("Compact()", "has_compacted_all_files: %d\n", has_compacted_all_files); if (has_compacted_all_files) break; } force_compaction_ = false; } // START: Helpers for Snapshots // Caller must delete fileids_ignore Status GetNewSnapshotData(uint32_t *snapshot_id, std::set **fileids_ignore) { std::unique_lock lock(mutex_snapshot_); *snapshot_id = IncrementSequenceSnapshot(1); *fileids_ignore = new std::set(); for (auto& p: num_references_to_unused_files_) { (*fileids_ignore)->insert(p.first); } return Status::OK(); } Status ReleaseSnapshot(uint32_t snapshot_id) { std::unique_lock lock(mutex_snapshot_); if (snapshotids_to_fileids_.find(snapshot_id) == snapshotids_to_fileids_.end()) { return Status::IOError("No snapshot with specified id"); } for (auto& fileid: snapshotids_to_fileids_[snapshot_id]) { if(num_references_to_unused_files_[fileid] == 1) { log::trace("ReleaseSnapshot()", "Removing [%s]", hstable_manager_.GetFilepath(fileid).c_str()); if (std::remove(hstable_manager_.GetFilepath(fileid).c_str()) != 0) { log::emerg("ReleaseSnapshot()", "Could not remove file [%s]", hstable_manager_.GetFilepath(fileid).c_str()); } if (std::remove(hstable_manager_.GetLockFilepath(fileid).c_str()) != 0) { log::emerg("ReleaseSnapshot()", "Could not lock file [%s]", hstable_manager_.GetLockFilepath(fileid).c_str()); } hstable_manager_.file_resource_manager.ClearAllDataForFileId(fileid); num_references_to_unused_files_.erase(fileid); } else { num_references_to_unused_files_[fileid] -= 1; } } snapshotids_to_fileids_.erase(snapshot_id); return Status::OK(); } Status ReleaseAllSnapshots() { for (auto& p: snapshotids_to_fileids_) { Status s = ReleaseSnapshot(p.first); if (!s.IsOK()) return s; } return Status::OK(); } uint64_t GetSequenceSnapshot() { std::unique_lock lock(mutex_sequence_snapshot_); return sequence_snapshot_; } uint64_t IncrementSequenceSnapshot(uint64_t inc) { std::unique_lock 
lock(mutex_sequence_snapshot_); sequence_snapshot_ += inc; return sequence_snapshot_; } std::string GetFilepath(uint32_t fileid) { return hstable_manager_.GetFilepath(fileid); } uint32_t FlushCurrentFileForSnapshot() { return hstable_manager_.FlushCurrentFile(1, 0); } uint32_t FlushCurrentFileForForcedCompaction() { return hstable_manager_.FlushCurrentFile(1, 0); } std::vector* GetFileidsIterator() { return fileids_iterator_; } // END: Helpers for Snapshots uint64_t GetDbSizeUncompacted() { return hstable_manager_.file_resource_manager.GetDbSizeUncompacted(); } private: void AcquireWriteLock() { // Also waits for readers to finish // NOTE: should this be made its own templated class? mutex_write_.lock(); while(true) { std::unique_lock lock_read(mutex_read_); if (num_readers_ == 0) break; cv_read_.wait(lock_read); } } void ReleaseWriteLock() { mutex_write_.unlock(); } // Options DatabaseOptions db_options_; EventManager *event_manager_; Hash *hash_; bool is_read_only_; std::set* fileids_ignore_; std::string prefix_compaction_; std::string dirpath_locks_; // Data std::string dbname_; HSTableManager hstable_manager_; std::map data_; std::thread thread_data_; std::condition_variable cv_read_; std::mutex mutex_read_; std::mutex mutex_write_; int num_readers_; std::shared_ptr file_manager_; // Index std::multimap index_; std::multimap index_compaction_; std::thread thread_index_; //std::mutex mutex_index_; // Compaction HSTableManager hstable_manager_compaction_; std::condition_variable cv_loop_compaction_; std::mutex mutex_loop_compaction_; std::mutex mutex_compaction_; bool is_compaction_in_progress_; std::thread thread_compaction_; std::map num_references_to_unused_files_; bool force_compaction_; // Statistics std::mutex mutex_statistics_; std::thread thread_statistics_; std::condition_variable cv_statistics_; uint64_t fs_free_space_; // in bytes // Snapshot std::mutex mutex_snapshot_; std::map< uint32_t, std::set > snapshotids_to_fileids_; std::mutex 
mutex_sequence_snapshot_; uint32_t sequence_snapshot_; std::vector *fileids_iterator_; // Stopping and closing bool stop_requested_; bool is_closed_; std::mutex mutex_close_; }; } // namespace kdb #endif // KINGDB_STORAGE_ENGINE_H_ ================================================ FILE: thread/event_manager.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. #ifndef KINGDB_EVENT_MANAGER_H_ #define KINGDB_EVENT_MANAGER_H_ #include "util/debug.h" #include #include #include #include namespace kdb { template class Event { public: Event() { has_data = false; } void StartAndBlockUntilDone(T& data) { std::unique_lock lock_start(mutex_unique_); std::unique_lock lock(mutex_); data_ = data; has_data = true; cv_ready_.notify_one(); cv_done_.wait(lock); } T Wait() { std::unique_lock lock(mutex_); if (!has_data) { cv_ready_.wait(lock); } return data_; } void Done() { std::unique_lock lock(mutex_); has_data = false; cv_done_.notify_one(); } void NotifyWait() { cv_ready_.notify_one(); } private: T data_; bool has_data; std::mutex mutex_; // protect the data held in the object std::mutex mutex_unique_; // make sure only one thread can enter the Start method std::condition_variable cv_ready_; std::condition_variable cv_done_; }; class EventManager { public: EventManager() {} Event> flush_buffer; Event> update_index; Event clear_buffer; Event compaction_status; }; } #endif // KINGDB_EVENT_MANAGER_H_ ================================================ FILE: thread/threadpool.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. 
#ifndef KINGDB_THREADPOOL_H_ #define KINGDB_THREADPOOL_H_ #include "util/debug.h" #include #include #include #include #include #include namespace kdb { class Task { public: Task():stop_requested_(false) {} virtual ~Task() {} virtual void RunInLock(std::thread::id tid) = 0; virtual void Run(std::thread::id tid, uint64_t id) = 0; bool IsStopRequested() { return stop_requested_; } void Stop() { stop_requested_ = true; } bool stop_requested_; }; class ThreadPool { // TODO: What if too many items are incoming? add_task() // must return an error beyond a certain limit, or timeout // TODO: What if a run() method throws an exception? => force it to be noexcept? // TODO: Impose limit on number of items in queue -- for thread pool over // sockets, the queue should be of size 0 // TODO: Verify that the Stop() method on the tasks makes the workers stop as // expected. // TODO: Protect accesses to tid_to_id_ and tid_to_task_ with mutexes // TODO: Is it possible that some threads die and never get re-created? Thus // the pool of threads could go down to none, and no processeing would // happen. 
public: int num_threads_; uint64_t seq_id; bool stop_requested_; std::queue queue_; std::condition_variable cv_; std::mutex mutex_; std::vector threads_; std::map tid_to_id_; std::map tid_to_task_; ThreadPool(int num_threads) : num_threads_(num_threads), seq_id(0), stop_requested_(false) { } ~ThreadPool() { } void ProcessingLoop() { while (!IsStopRequested()) { std::unique_lock lock(mutex_); if (queue_.empty()) { cv_.wait(lock); if (IsStopRequested()) continue; } if (queue_.empty()) continue; Task* task = queue_.front(); queue_.pop(); if (task == nullptr) continue; auto tid = std::this_thread::get_id(); auto it_find = tid_to_id_.find(tid); uint64_t id = 0; if (it_find == tid_to_id_.end()) id = seq_id++; tid_to_id_[tid] = id; tid_to_task_[tid] = task; task->RunInLock(tid); lock.unlock(); task->Run(tid, id); mutex_.lock(); if (!IsStopRequested()) delete task; tid_to_task_.erase(tid); mutex_.unlock(); } } void AddTask(Task* task) { std::unique_lock lock(mutex_); queue_.push(task); cv_.notify_one(); } void Start() { // NOTE: Should each thread run the loop, or should the loop be running in a // main thread that is then dispatching work by notifying other // threads? 
for (auto i = 0; i < num_threads_; i++) { threads_.push_back(std::thread(&ThreadPool::ProcessingLoop, this)); } } void Stop() { stop_requested_ = true; cv_.notify_all(); for (auto& t: threads_) { t.join(); } mutex_.lock(); for (auto& tid_task: tid_to_task_) { Task* task = tid_task.second; task->Stop(); delete task; } while (!queue_.empty()) { Task *task = queue_.front(); queue_.pop(); delete task; } mutex_.unlock(); } void BlockUntilAllTasksHaveCompleted() { while (!queue_.empty()) { // TODO: protect accesses to queue_ with a mutex std::this_thread::sleep_for(std::chrono::milliseconds(500)); } Stop(); } bool IsStopRequested() { return stop_requested_; } }; } #endif // KINGDB_THREADPOOL_H_ ================================================ FILE: thread/threadstorage.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. #ifndef KINGDB_THREADSTORAGE_H_ #define KINGDB_THREADSTORAGE_H_ #include "util/debug.h" #include #include #include namespace kdb { // TODO: Change this for a "thread_local static" -- when LLVM will support it // TODO: Templatize this class so it can be used to create whatever type is // needed, and replace the unique mutex by an array of mutexes to avoid // lock contention -- there would be multiple maps as well. // TODO-22: What if some threads are crashing? How to cleanup the content of the // storage when that happens? -- any other part of the system for which // crashing threads would cause resource/memory leaks? // NOTE: If threads crash or terminate, their data will *not* be cleaned up. 
class ThreadStorage { public: uint64_t get() { std::thread::id id = std::this_thread::get_id(); std::unique_lock lock(mutex_); return values_[id]; } void put(uint64_t value) { std::thread::id id = std::this_thread::get_id(); std::unique_lock lock(mutex_); values_[id] = value; } void reset() { std::thread::id id = std::this_thread::get_id(); std::unique_lock lock(mutex_); values_[id] = 0; } private: std::mutex mutex_; std::map values_; }; }; #endif // KINGDB_THREADSTORAGE_H_ ================================================ FILE: unit-tests/client_embedded.cc ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef DEBUG #include #endif #include "interface/database.h" #include "util/status.h" #include "util/order.h" #include "util/byte_array.h" #include "interface/snapshot.h" #include "interface/iterator.h" #define SIZE_LARGE_TEST_ITEMS 1024*1024*64 // size of large items used for testing void handler(int sig) { int depth_max = 20; void *array[depth_max]; size_t depth; depth = backtrace(array, depth_max); fprintf(stderr, "Error: signal %d:\n", sig); backtrace_symbols_fd(array, depth, STDERR_FILENO); exit(1); } int main() { signal(SIGSEGV, handler); signal(SIGABRT, handler); kdb::Logger::set_current_level("info"); kdb::DatabaseOptions options; options.compression = kdb::kLZ4Compression; //options.storage__maximum_part_size = 128 * 1024; kdb::Database db(options, "mydb"); db.Open(); kdb::ReadOptions read_options; kdb::WriteOptions write_options; int size = SIZE_LARGE_TEST_ITEMS; char *buffer_large = new char[size+1]; for (auto i = 0; i < size; i++) { buffer_large[i] = 'a'; } buffer_large[size] = '\0'; int num_items = 1000000; std::vector items; int size_key = 16; for (auto i = 0; i < 
num_items; i++) { std::stringstream ss; ss << std::setfill ('0') << std::setw (size_key); ss << i; //std::cout << ss.str() << std::endl; items.push_back(ss.str()); } std::chrono::high_resolution_clock::time_point start = std::chrono::high_resolution_clock::now(); for (auto i = 0; i < num_items; i++) { kdb::ByteArray key = kdb::NewDeepCopyByteArray(items[i].c_str(), items[i].size()); kdb::ByteArray value = kdb::NewDeepCopyByteArray(buffer_large, 100); kdb::Status s = db.PutPart(write_options, key, value, 0, 100); } std::chrono::high_resolution_clock::time_point end = std::chrono::high_resolution_clock::now(); uint64_t duration = std::chrono::duration_cast(end - start).count(); std::cout << "done in " << duration << " ms" << std::endl; kdb::Snapshot snapshot = db.NewSnapshot(); kdb::Iterator iterator = snapshot.NewIterator(read_options); #ifdef DEBUG ProfilerStart("/tmp/kingdb.prof"); #endif iterator.Begin(); start = std::chrono::high_resolution_clock::now(); auto count_items = 0; for (; iterator.IsValid(); iterator.Next()) { kdb::ByteArray key = iterator.GetKey(); kdb::ByteArray value = iterator.GetValue(); count_items += 1; } end = std::chrono::high_resolution_clock::now(); duration = std::chrono::duration_cast(end - start).count(); std::cout << "iteration done in " << duration << " ms" << std::endl; std::cout << "count items: " << count_items << std::endl; #ifdef DEBUG ProfilerStop(); ProfilerFlush(); #endif delete[] buffer_large; return 0; } ================================================ FILE: unit-tests/dummy_interface.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. 
#ifndef KINGDB_DUMMY_INTERFACE_H_ #define KINGDB_DUMMY_INTERFACE_H_ #include #include #include "interface/kingdb.h" #include "util/status.h" #include "util/logger.h" namespace kdb { class DummyInterface: public KingDB { public: DummyInterface() {} virtual ~DummyInterface() {} virtual Status Get(const std::string& key, std::string *value_out) override { log::trace("KingDB Get()", "[%s]", key.c_str()); std::unique_lock lock(mutex_); if (map_.find(key) == map_.end()) { return Status::NotFound("not found"); } *value_out = map_[key]; return Status::OK(); } virtual Status Put(const std::string& key, const std::string& value) override { log::trace("KingDB Put()", "[%s] [%s]", key.c_str(), value.c_str()); std::unique_lock lock(mutex_); map_[key] = value; return Status::OK(); } virtual Status Delete(const std::string& key) override { log::trace("KingDB Delete()", "[%s]", key.c_str()); std::unique_lock lock(mutex_); if (map_.find(key) == map_.end()) { return Status::NotFound("not found"); } map_.erase(key); return Status::OK(); } private: std::mutex mutex_; std::map map_; }; }; #endif // KINGDB_DUMMY_INTERFACE_H_ ================================================ FILE: unit-tests/dummy_storage_engine.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. 
#ifndef KINGDB_STORAGE_ENGINE_H_ #define KINGDB_STORAGE_ENGINE_H_ #include #include #include #include "kingdb/kdb.h" namespace kdb { class StorageEngine { public: StorageEngine() { thread_ = std::thread(&StorageEngine::ProcessingLoop, this); } ~StorageEngine() { thread_.join(); } void ProcessingLoop() { while(true) { log::trace("StorageEngine::processing_loop()", "start"); std::vector buffer = EventManager::flush_buffer.Wait(); log::trace("StorageEngine::processing_loop()", "got buffer"); mutex_data_.lock(); for (auto& order: buffer) { if (order.type == OrderType::Put) { data_[order.key] = order.value; } else { // order.type == OrderType::Delete data_.erase(order.key); } } mutex_data_.unlock(); EventManager::flush_buffer.Done(); log::trace("StorageEngine::processing_loop()", "done"); //EventManager::update_index.StartAndBlockUntilDone(buffer); } } Status GetEntry(const std::string& key, std::string *value_out) { std::unique_lock lock(mutex_data_); log::trace("StorageEngine::GetEntry()", "%s", key.c_str()); auto p = data_.find(key); if (p != data_.end()) { *value_out = data_[key]; log::trace("StorageEngine::GetEntry()", "%s - found [%s]", key.c_str(), value_out->c_str()); return Status::OK(); } log::trace("StorageEngine::GetEntry()", "%s - not found!", key.c_str()); return Status::NotFound("Unable to find the entry in the storage engine"); } private: std::map data_; std::thread thread_; std::mutex mutex_data_; }; }; #endif // KINGDB_STORAGE_ENGINE_H_ ================================================ FILE: unit-tests/dummy_storage_engine_index.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. 
#ifndef KINGDB_STORAGE_ENGINE_H_ #define KINGDB_STORAGE_ENGINE_H_ #include #include #include #include #include "kingdb/kdb.h" namespace kdb { class StorageEngine { public: StorageEngine(std::string dbname) { log::trace("StorageEngine:StorageEngine()", "dbname: %s", dbname.c_str()); dbname_ = dbname; thread_index_ = std::thread(&StorageEngine::ProcessingLoopIndex, this); thread_data_ = std::thread(&StorageEngine::ProcessingLoopData, this); sequence_ = 2; // starting at 2, because 0 is the special offset for 'remove' num_readers_ = 0; } ~StorageEngine() { thread_index_.join(); thread_data_.join(); } void ProcessingLoopData() { while(true) { // Wait for orders to process log::trace("StorageEngine::ProcessingLoop()", "start"); //log::trace("SE", "WAIT: flush_buffer"); std::vector buffer = EventManager::flush_buffer.Wait(); log::trace("StorageEngine::ProcessingLoop()", "got buffer"); // Wait for readers to exit //log::trace("SE", "WAIT: write_lock"); mutex_write_.lock(); while(true) { std::unique_lock lock_read(mutex_read_); if (num_readers_ == 0) break; cv_read_.wait(lock_read); } // Process orders, and create update map for the index std::map map_index; for (auto& order: buffer) { auto p = key_to_offset_.find(order.key); if (p != key_to_offset_.end()) { data_.erase(p->second); data_.erase(p->second+1); key_to_offset_.erase(order.key); } if (order.type == OrderType::Put) { key_to_offset_[order.key] = sequence_; data_[sequence_] = order.key; data_[sequence_+1] = order.value; map_index[order.key] = sequence_; sequence_ += 2; } else { // order.type == OrderType::Delete map_index[order.key] = 0; } } // Release lock and handle events mutex_write_.unlock(); // Sleep to simulate latency std::chrono::milliseconds delay(1000); std::this_thread::sleep_for(delay); EventManager::flush_buffer.Done(); //log::trace("SE", "WAIT: update_index"); EventManager::update_index.StartAndBlockUntilDone(map_index); log::trace("StorageEngine::ProcessingLoop()", "done"); } } void 
ProcessingLoopIndex() { while(true) { log::trace("StorageEngine::ProcessingLoop()", "start"); std::map buffer = EventManager::update_index.Wait(); log::trace("StorageEngine::ProcessingLoop()", "got buffer"); //log::trace("INDEX", "WAIT: loop:mutex_index_"); mutex_index_.lock(); for (auto& p: buffer) { if (p.second == 0) { index_.erase(p.first); } else { index_[p.first] = p.second; } } mutex_index_.unlock(); EventManager::update_index.Done(); log::trace("StorageEngine::ProcessingLoop()", "done"); int temp = 1; //log::trace("INDEX", "WAIT: loop:clear_buffer"); EventManager::clear_buffer.StartAndBlockUntilDone(temp); } } Status Get(const std::string& key, std::string *value_out) { //log::trace("INDEX", "WAIT: Get()-mutex_index_"); std::unique_lock lock(mutex_index_); log::trace("StorageEngine::GetEntry()", "%s", key.c_str()); auto p = index_.find(key); if (p != index_.end()) { std::string key_temp; GetEntry(index_[key], &key_temp, value_out); log::trace("StorageEngine::GetEntry()", "key:[%s] key_temp:[%s] - value:[%s]", key.c_str(), key_temp.c_str(), value_out->c_str()); return Status::OK(); } log::trace("StorageEngine::GetEntry()", "%s - not found!", key.c_str()); return Status::NotFound("Unable to find the entry in the storage engine"); } Status GetEntry(uint64_t offset, std::string *key_out, std::string *value_out) { log::trace("StorageEngine::GetEntry()", "start"); Status s = Status::OK(); mutex_write_.lock(); mutex_read_.lock(); num_readers_ += 1; mutex_read_.unlock(); mutex_write_.unlock(); auto p = data_.find(offset); if (p == data_.end()) { log::trace("StorageEngine::GetEntry()", "not found!"); s = Status::NotFound("Unable to find the entry in the storage engine"); } else { *key_out = data_[offset]; *value_out = data_[offset+1]; log::trace("StorageEngine::GetEntry()", "%s - found [%s]", key_out->c_str(), value_out->c_str()); } mutex_read_.lock(); num_readers_ -= 1; mutex_read_.unlock(); cv_read_.notify_one(); return s; } private: // Data std::string dbname_; 
std::map data_; std::map key_to_offset_; std::thread thread_data_; uint64_t sequence_; std::condition_variable cv_read_; std::mutex mutex_read_; std::mutex mutex_write_; int num_readers_; // Index std::map index_; std::thread thread_index_; std::mutex mutex_index_; }; }; #endif // KINGDB_STORAGE_ENGINE_H_ ================================================ FILE: unit-tests/kingdb_user.cc ================================================ #include // Compiling: // - LLVM: // $ g++ -std=c++11 -I/usr/local/include/kingdb -lkingdb kingdb_user.cc -o kingdb_user // - GCC: // $ g++ -std=c++11 -I/usr/local/include/kingdb kingdb_user.cc -o kingdb_user -Wl,--no-as-needed -L/usr/local/lib -lpthread -lkingdb int main() { kdb::Status s; kdb::DatabaseOptions db_options; kdb::Database db(db_options, "mylocaldb"); s = db.Open(); if (!s.IsOK()) { fprintf(stderr, "Could not open the database: %s\n", s.ToString().c_str()); exit(1); } std::string key1("key1"); std::string key2("key2"); std::string value1("value1"); std::string value2("value2"); kdb::WriteOptions write_options; s = db.Put(write_options, key1, value1); s = db.Put(write_options, key2, value2); kdb::ReadOptions read_options; int num_count_valid = 0; std::string out_str; s = db.Get(read_options, key1, &out_str); if (s.IsOK() && out_str == "value1") num_count_valid += 1; s = db.Get(read_options, key2, &out_str); if (s.IsOK() && out_str == "value2") num_count_valid += 1; if (num_count_valid == 2) { printf("Data successfully stored and retrieved\n"); } else { printf("An error occurred\n"); } return 0; } ================================================ FILE: unit-tests/test_compression.cc ================================================ #include "algorithm/compressor.h" #include "algorithm/lz4.h" char* MakeValue(const std::string& key, int size_value) { int size_key = key.size(); char *str = new char[size_value+1]; str[size_value] = '\0'; int i = 0; for (i = 0; i < size_value / size_key; i++) { memcpy(str + i*size_key, key.c_str(), 
size_key); } if (size_value % size_key != 0) { memcpy(str + i*size_key, key.c_str(), size_value % size_key); } return str; } int VerifyValue(const std::string& key, int size_value, const char* value) { int size_key = key.size(); int i = 0; bool error = false; for (i = 0; i < size_value / size_key; i++) { if (memcmp(value + i*size_key, key.c_str(), size_key)) { std::string value2(value + i*size_key, size_key); printf("diff i:%d size:%d key:[%s], value:[%s]\n", i, size_key, key.c_str(), value2.c_str()); error = true; } } if (size_value % size_key != 0) { if (memcmp(value + i*size_key, key.c_str(), size_value % size_key)) { std::string value2(value, size_value % size_key); printf("diff remainder size:%d key:[%s], value:[%s]\n", size_value % size_key, key.c_str(), value2.c_str()); error = true; } } if (error) return -1; return 0; } int main() { kdb::CompressorLZ4 lz4; char* raw = nullptr; char *compressed = nullptr; uint64_t size_value = 442837; uint64_t size_compressed = 0; uint64_t size_chunk = 64*1024; uint64_t offset_chunk_compressed = 0; uint64_t SIZE_BUFFER = 1024*1024; std::string key("0x10c095000-0"); raw = MakeValue(key, size_value); lz4.ResetThreadLocalStorage(); compressed = new char[SIZE_BUFFER]; uint64_t num_chunks = size_value / size_chunk; char *comp; if (size_value % size_chunk) num_chunks += 1; for (uint64_t chunk = 0; chunk < num_chunks; chunk++) { uint64_t size_chunk_current = 0; size_chunk_current = size_chunk; if (chunk == num_chunks - 1) { size_chunk_current = size_value % size_chunk; } fprintf(stderr, "step:%" PRIu64 " size:%" PRIu64 "\n", chunk, size_chunk_current); offset_chunk_compressed = lz4.size_compressed(); kdb::Status s = lz4.Compress(raw + chunk * size_chunk, size_chunk_current, &comp, &size_compressed); if (!s.IsOK()) { fprintf(stderr, "%s\n", s.ToString().c_str()); exit(-1); } fprintf(stderr, "step %" PRIu64 " - %p size_compressed:%" PRIu64 " offset:%" PRIu64 "\n", chunk, comp, size_compressed, offset_chunk_compressed); 
memcpy(compressed + offset_chunk_compressed, comp, size_compressed); fprintf(stderr, "step %" PRIu64 " - size_compressed:%" PRIu64 "\n", chunk, size_compressed); } size_compressed = lz4.size_compressed(); fprintf(stderr, "--- stream compressed data (size:%" PRIu64 "):\n", size_compressed); for (uint64_t i = 0; i < size_compressed; i++) { fprintf(stderr, "%c", compressed[i]); } fprintf(stderr, "\n--- done\n"); char *uncompressed; uint64_t size_out; uint64_t size_out_total = 0; char *frame; uint64_t size_frame; int step = 0; char *uncompressed_full = new char[1024*1024]; while(true) { kdb::Status s1 = lz4.Uncompress(compressed, size_compressed, &uncompressed, &size_out, &frame, &size_frame); fprintf(stderr, "stream uncompressed step: %d size:%" PRIu64 "\n", step, size_out); if (!s1.IsOK()) { fprintf(stderr, "%s\n", s1.ToString().c_str()); break; } memcpy(uncompressed_full + size_out_total, uncompressed, size_out); size_out_total += size_out; step += 1; } fprintf(stderr, "stream uncompressed size: %" PRIu64 "\n", size_out_total); int ret = VerifyValue(key, size_value, uncompressed_full); if (ret == 0) { fprintf(stderr, "Verify(): ok\n"); } return 0; } ================================================ FILE: unit-tests/test_db.cc ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. 
#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "interface/database.h" #include "kingdb/kdb.h" #include "util/debug.h" #include "util/status.h" #include "util/order.h" #include "util/byte_array.h" #include "util/file.h" #include "interface/snapshot.h" #include "interface/iterator.h" #include "unit-tests/testharness.h" namespace kdb { class KeyGenerator { public: virtual ~KeyGenerator() {} virtual std::string GetKey(uint64_t thread_id, uint64_t index, int size) = 0; }; class SequentialKeyGenerator: public KeyGenerator { public: virtual ~SequentialKeyGenerator() {} virtual std::string GetKey(uint64_t thread_id, uint64_t index, int size) { std::stringstream ss; ss << std::setfill ('0') << std::setw (size); ss << index; return ss.str(); } }; class RandomKeyGenerator: public KeyGenerator { public: RandomKeyGenerator() { generator = std::mt19937(seq); random_dist = std::uniform_int_distribution(0,255); } virtual ~RandomKeyGenerator() {} virtual std::string GetKey(uint64_t thread_id, uint64_t index, int size) { std::string str; str.resize(size); for (int i = 0; i < size; i++) { str[i] = static_cast(random_dist(generator)); } return str; } private: std::seed_seq seq{1, 2, 3, 4, 5, 6, 7}; std::mt19937 generator; std::uniform_int_distribution random_dist; }; class DataGenerator { public: virtual ~DataGenerator() {} virtual void GenerateData(char *data, int size) = 0; }; class IncompressibleDataGenerator: public DataGenerator { public: IncompressibleDataGenerator() { generator = std::mt19937(seq); random_dist = std::uniform_int_distribution(0,255); } virtual ~IncompressibleDataGenerator() {} virtual void GenerateData(char *data, int size) { for (int i = 0; i < size; i++) { data[i] = static_cast(random_dist(generator)); } } private: std::seed_seq seq{1, 2, 3, 4, 5, 6, 7}; std::mt19937 generator; std::uniform_int_distribution random_dist; }; class CompressibleDataGenerator: public 
DataGenerator { public: CompressibleDataGenerator() { generator = std::mt19937(seq); random_dist = std::uniform_int_distribution(0,255); } virtual ~CompressibleDataGenerator() {} virtual void GenerateData(char *data, int size) { int i = 0; while (i < size) { char c = static_cast(random_dist(generator)); int repetition = random_dist(generator) % 30 + 1; if (i + repetition > size) repetition = size - i; memset(data + i, c, repetition); i += repetition; } } private: std::seed_seq seq{1, 2, 3, 4, 5, 6, 7}; std::mt19937 generator; std::uniform_int_distribution random_dist; }; class DBTest { public: DBTest() { dbname_ = "db_test"; db_ = nullptr; db_options_.compression.type = kLZ4Compression; index_db_options_ = 0; data_generator_ = nullptr; } void Open(bool erase_db=true) { if(erase_db) EraseDB(); db_ = new kdb::Database(db_options_, dbname_); Status s = db_->Open(); if (!s.IsOK()) { delete db_; log::emerg("Server", s.ToString().c_str()); } } void Close(bool erase_db=true) { db_->Close(); delete db_; db_ = nullptr; if(erase_db) EraseDB(); } void OpenWithoutErasingDB() { Open(false); } void CloseWithoutErasingDB() { Close(false); } void DeleteDatabaseOptionsFile() { std::string filepath = DatabaseOptions::GetPath(dbname_); if (std::remove(filepath.c_str()) != 0) { log::emerg("DBTest::LoadDatabase()", "Could not remove file [%s]", filepath.c_str()); } } void ResetAllOptions() { db_options_ = DatabaseOptions(); read_options_ = ReadOptions(); write_options_ = WriteOptions(); } bool IterateOverOptions() { ResetAllOptions(); if (index_db_options_ == 0) { test_purpose_ = "Incompressible data with LZ4 compression enabled"; data_generator_ = new IncompressibleDataGenerator(); db_options_.compression.type = kLZ4Compression; } else if (index_db_options_ == 1) { test_purpose_ = "Compressible data with LZ4 compression enabled"; data_generator_ = new CompressibleDataGenerator(); db_options_.compression.type = kLZ4Compression; } else if (index_db_options_ == 2) { test_purpose_ = 
"Compression disabled"; data_generator_ = new IncompressibleDataGenerator(); db_options_.compression.type = kNoCompression; } else if (index_db_options_ == 3) { test_purpose_ = "64-bit MurmurHash3"; data_generator_ = new CompressibleDataGenerator(); db_options_.compression.type = kLZ4Compression; db_options_.storage__hashing_algorithm = kMurmurHash3_64; } else if (index_db_options_ == 4) { test_purpose_ = "Checksum verification with incompressible data"; data_generator_ = new IncompressibleDataGenerator(); db_options_.compression.type = kLZ4Compression; read_options_.verify_checksums = true; } else if (index_db_options_ == 5) { test_purpose_ = "Checksum verification with compressible data"; data_generator_ = new CompressibleDataGenerator(); db_options_.compression.type = kLZ4Compression; read_options_.verify_checksums = true; } else if (index_db_options_ == 6) { test_purpose_ = "Checksum verification with compression disabled"; data_generator_ = new IncompressibleDataGenerator(); db_options_.compression.type = kNoCompression; read_options_.verify_checksums = true; } else if (index_db_options_ == 7) { test_purpose_ = "Synced writes"; data_generator_ = new IncompressibleDataGenerator(); db_options_.compression.type = kLZ4Compression; write_options_.sync = true; } else if (index_db_options_ == 8) { test_purpose_ = "Small-sized HSTables with compression disabled"; data_generator_ = new IncompressibleDataGenerator(); db_options_.compression.type = kNoCompression; db_options_.write_buffer__mode = kWriteBufferModeDirect; db_options_.write_buffer__size = 1024 * 256; db_options_.storage__maximum_part_size = 1024 * 8; db_options_.storage__hstable_size = 1024 * 200; } else if (index_db_options_ == 9) { test_purpose_ = "Small-sized HSTables with incompressible data and LZ4 compression"; data_generator_ = new IncompressibleDataGenerator(); db_options_.compression.type = kLZ4Compression; db_options_.write_buffer__mode = kWriteBufferModeDirect; db_options_.write_buffer__size = 
1024 * 256; db_options_.storage__maximum_part_size = 1024 * 8; db_options_.storage__hstable_size = 1024 * 200; } else if (index_db_options_ == 10) { test_purpose_ = "Small-sized HSTables with compressible data and LZ4 compression"; data_generator_ = new CompressibleDataGenerator(); db_options_.compression.type = kLZ4Compression; db_options_.write_buffer__mode = kWriteBufferModeDirect; db_options_.write_buffer__size = 1024 * 256; db_options_.storage__maximum_part_size = 1024 * 8; db_options_.storage__hstable_size = 1024 * 200; } else if (index_db_options_ == 11) { test_purpose_ = "Direct mode for Write Buffer (incompressible data with LZ4 compression enabled)"; data_generator_ = new IncompressibleDataGenerator(); db_options_.compression.type = kLZ4Compression; db_options_.write_buffer__mode = kWriteBufferModeDirect; } else if (index_db_options_ == 12) { test_purpose_ = "Direct mode for Write Buffer (compressible data with LZ4 compression enabled)"; data_generator_ = new CompressibleDataGenerator(); db_options_.compression.type = kLZ4Compression; db_options_.write_buffer__mode = kWriteBufferModeDirect; } else { return false; } fprintf(stdout, "Database Options: Stage %d - %s\n", index_db_options_, test_purpose_.c_str()); index_db_options_ += 1; return true; } void EraseDB() { struct dirent *entry; DIR *dir; char filepath[FileUtil::maximum_path_size()]; struct stat info; if (stat(dbname_.c_str(), &info) != 0) return; dir = opendir(dbname_.c_str()); while ((entry = readdir(dir)) != nullptr) { sprintf(filepath, "%s/%s", dbname_.c_str(), entry->d_name); std::remove(filepath); } rmdir(dbname_.c_str()); } kdb::Status Get(ReadOptions& read_options_, const std::string& key, std::string *value_out) { return Status::OK(); } kdb::Status Put(const std::string& key, const std::string& value) { return Status::OK(); } kdb::Database* db_; std::string test_purpose_; DataGenerator *data_generator_; std::string dbname_; DatabaseOptions db_options_; ReadOptions read_options_; 
WriteOptions write_options_; int index_db_options_; }; TEST(DBTest, CloseAndReopen) { ResetAllOptions(); kdb::Logger::set_current_level("emerg"); Open(); kdb::Status s; int num_count_valid = 0; s = db_->Put(write_options_, "key1", "value1"); s = db_->Put(write_options_, "key2", "value2"); CloseWithoutErasingDB(); OpenWithoutErasingDB(); std::string out_str; s = db_->Get(read_options_, "key1", &out_str); if (s.IsOK() && out_str == "value1") num_count_valid += 1; s = db_->Get(read_options_, "key2", &out_str); if (s.IsOK() && out_str == "value2") num_count_valid += 1; ASSERT_EQ(num_count_valid, 2); Close(); } TEST(DBTest, RepairInvalidDatabaseOptionFile) { ResetAllOptions(); kdb::Logger::set_current_level("emerg"); Open(); kdb::Status s; int num_count_valid = 0; s = db_->Put(write_options_, "key1", "value1"); s = db_->Put(write_options_, "key2", "value2"); CloseWithoutErasingDB(); DeleteDatabaseOptionsFile(); OpenWithoutErasingDB(); std::string out_str; s = db_->Get(read_options_, "key1", &out_str); if (s.IsOK() && out_str == "value1") num_count_valid += 1; s = db_->Get(read_options_, "key2", &out_str); if (s.IsOK() && out_str == "value2") num_count_valid += 1; ASSERT_EQ(num_count_valid, 2); Close(); } TEST(DBTest, KeysWithNullBytes) { ResetAllOptions(); kdb::Logger::set_current_level("emerg"); Open(); kdb::Status s; int num_count_valid = 0; //kdb::ByteArray byte_array = kdb::NewDeepCopyByteArray("blahblah", 8); //fprintf(stderr, "byte_array: %s\n", byte_array.ToString().c_str()); std::string key1("000000000000key1"); std::string key2("000000000000key2"); key1[5] = '\0'; key2[5] = '\0'; s = db_->Put(write_options_, key1, "value1"); s = db_->Put(write_options_, key2, "value2"); std::string out_str; s = db_->Get(read_options_, key1, &out_str); if (s.IsOK() && out_str == "value1") num_count_valid += 1; //fprintf(stderr, "num_count_valid:%d\n", num_count_valid); s = db_->Get(read_options_, key2, &out_str); if (s.IsOK() && out_str == "value2") num_count_valid += 1; 
//fprintf(stderr, "num_count_valid:%d\n", num_count_valid); // Sleeping to let the buffer store the entries on secondary storage std::this_thread::sleep_for(std::chrono::milliseconds(2000)); s = db_->Get(read_options_, key1, &out_str); if (s.IsOK() && out_str == "value1") num_count_valid += 1; //fprintf(stderr, "num_count_valid:%d\n", num_count_valid); s = db_->Get(read_options_, key2, &out_str); if (s.IsOK() && out_str == "value2") num_count_valid += 1; //fprintf(stderr, "num_count_valid:%d\n", num_count_valid); ASSERT_EQ(num_count_valid, 4); Close(); } TEST(DBTest, TestStringInterface){ std::string value; kdb::ReadOptions read_options; kdb::WriteOptions write_options; int size_key = 16; int num_items = 100000; ResetAllOptions(); while (IterateOverOptions()) { kdb::Logger::set_current_level("emerg"); Open(); kdb::Status s; for (auto i = 0; i < num_items; i++) { std::stringstream ss; ss << std::setfill ('0') << std::setw (size_key); ss << i; std::string key = ss.str(); db_->Put(write_options, key, "value-stuff"); } for (auto i = 0; i < num_items; i++) { std::stringstream ss; ss << std::setfill ('0') << std::setw (size_key); ss << i; std::string key = ss.str(); s = db_->Get(read_options, key, &value); ASSERT_EQ(s.IsOK(), true); } Close(); } } TEST(DBTest, MultipartReader) { ResetAllOptions(); while (IterateOverOptions()) { kdb::Logger::set_current_level("emerg"); Open(); kdb::Status s; s = db_->Put(write_options_, "key1", "value1"); if (!s.IsOK()) fprintf(stderr, "Error: %s\n", s.ToString().c_str()); ASSERT_EQ(s.IsOK(), true); kdb::MultipartReader mp_reader = db_->NewMultipartReader(read_options_, "key1"); s = mp_reader.GetStatus(); if (!s.IsOK()) fprintf(stderr, "Error: %s\n", s.ToString().c_str()); ASSERT_EQ(s.IsOK(), true); for (mp_reader.Begin(); mp_reader.IsValid(); mp_reader.Next()) { kdb::ByteArray part; s = mp_reader.GetPart(&part); if (!s.IsOK()) fprintf(stderr, "Error: %s\n", s.ToString().c_str()); ASSERT_EQ(s.IsOK(), true); } s = mp_reader.GetStatus(); if 
(!s.IsOK()) fprintf(stderr, "Error: %s\n", s.ToString().c_str()); ASSERT_EQ(s.IsOK(), true); Close(); } } TEST(DBTest, SingleThreadSmallEntries) { while (IterateOverOptions()) { kdb::Logger::set_current_level("emerg"); Open(); int size = 100; char *buffer_large = new char[size+1]; for (auto i = 0; i < size; i++) { buffer_large[i] = 'a'; } buffer_large[size+1] = '\0'; int num_items = 1000; KeyGenerator* kg = new RandomKeyGenerator(); std::map saved_data; for (auto i = 0; i < num_items; i++) { std::string key_str = kg->GetKey(0, i, 16); kdb::ByteArray key = kdb::NewDeepCopyByteArray(key_str.c_str(), key_str.size()); data_generator_->GenerateData(buffer_large, 100); kdb::ByteArray value = kdb::NewDeepCopyByteArray(buffer_large, 100); kdb::Status s = db_->Put(write_options_, key, value); if (!s.IsOK()) { fprintf(stderr, "ClientEmbedded - Error: %s\n", s.ToString().c_str()); } std::string value_str(buffer_large, 100); saved_data[key_str] = value_str; } int count_items_end = 0; kdb::Iterator iterator = db_->NewIterator(read_options_); kdb::Status s = iterator.GetStatus(); if (!s.IsOK()) { fprintf(stderr, "Error: %s\n", s.ToString().c_str()); } for (iterator.Begin(); iterator.IsValid(); iterator.Next()) { kdb::MultipartReader mp_reader = iterator.GetMultipartValue(); for (mp_reader.Begin(); mp_reader.IsValid(); mp_reader.Next()) { kdb::ByteArray part; kdb::Status s = mp_reader.GetPart(&part); } kdb::Status s = mp_reader.GetStatus(); if (s.IsOK()) { count_items_end += 1; } else { fprintf(stderr, "ClientEmbedded - Error: %s\n", s.ToString().c_str()); } } delete[] buffer_large; ASSERT_EQ(count_items_end, num_items); iterator.Close(); Close(); } } TEST(DBTest, SingleThreadSnapshot) { kdb::Logger::set_current_level("emerg"); while (IterateOverOptions()) { Open(); int size = 100; char *buffer_large = new char[size+1]; for (auto i = 0; i < size; i++) { buffer_large[i] = 'a'; } buffer_large[size+1] = '\0'; int num_items = 1000; KeyGenerator* kg = new RandomKeyGenerator(); 
std::map saved_data; for (auto i = 0; i < num_items; i++) { std::string key_str = kg->GetKey(0, i, 16); kdb::ByteArray key = kdb::NewDeepCopyByteArray(key_str.c_str(), key_str.size()); data_generator_->GenerateData(buffer_large, 100); kdb::ByteArray value = kdb::NewDeepCopyByteArray(buffer_large, 100); kdb::Status s = db_->Put(write_options_, key, value); if (!s.IsOK()) { fprintf(stderr, "ClientEmbedded - Error: %s\n", s.ToString().c_str()); } std::string value_str(buffer_large, 100); saved_data[key_str] = value_str; } int count_items_end = 0; kdb::Snapshot snapshot = db_->NewSnapshot(); kdb::Iterator iterator = snapshot.NewIterator(read_options_); kdb::Status s = iterator.GetStatus(); if (!s.IsOK()) { fprintf(stderr, "Error: %s\n", s.ToString().c_str()); } for (iterator.Begin(); iterator.IsValid(); iterator.Next()) { kdb::MultipartReader mp_reader = iterator.GetMultipartValue(); for (mp_reader.Begin(); mp_reader.IsValid(); mp_reader.Next()) { kdb::ByteArray part; kdb::Status s = mp_reader.GetPart(&part); } kdb::Status s = mp_reader.GetStatus(); if (s.IsOK()) { count_items_end += 1; } else { fprintf(stderr, "ClientEmbedded - Error: %s\n", s.ToString().c_str()); } } delete[] buffer_large; ASSERT_EQ(count_items_end, num_items); iterator.Close(); snapshot.Close(); Close(); } } TEST(DBTest, SingleThreadSmallEntriesCompaction) { while (IterateOverOptions()) { kdb::Logger::set_current_level("emerg"); Open(); int size = 100; char *buffer_large = new char[size+1]; for (auto i = 0; i < size; i++) { buffer_large[i] = 'a'; } buffer_large[size+1] = '\0'; int num_items = 1000; KeyGenerator* kg = new RandomKeyGenerator(); std::map saved_data; for (auto i = 0; i < num_items; i++) { std::string key_str = kg->GetKey(0, i, 16); kdb::ByteArray key = kdb::NewDeepCopyByteArray(key_str.c_str(), key_str.size()); data_generator_->GenerateData(buffer_large, 100); kdb::ByteArray value = kdb::NewDeepCopyByteArray(buffer_large, 100); kdb::Status s = db_->Put(write_options_, key, value); if 
(!s.IsOK()) { fprintf(stderr, "ClientEmbedded - Error: %s\n", s.ToString().c_str()); } std::string value_str(buffer_large, 100); saved_data[key_str] = value_str; } // Force compaction db_->Compact(); int count_items_end = 0; kdb::Iterator iterator = db_->NewIterator(read_options_); kdb::Status s = iterator.GetStatus(); if (!s.IsOK()) { fprintf(stderr, "Error: %s\n", s.ToString().c_str()); } for (iterator.Begin(); iterator.IsValid(); iterator.Next()) { kdb::MultipartReader mp_reader = iterator.GetMultipartValue(); for (mp_reader.Begin(); mp_reader.IsValid(); mp_reader.Next()) { kdb::ByteArray part; kdb::Status s = mp_reader.GetPart(&part); } kdb::Status s = mp_reader.GetStatus(); if (s.IsOK()) { count_items_end += 1; } else { fprintf(stderr, "ClientEmbedded - Error: %s\n", s.ToString().c_str()); } } delete[] buffer_large; ASSERT_EQ(count_items_end, num_items); iterator.Close(); Close(); } } TEST(DBTest, SequentialIterator) { while (IterateOverOptions()) { kdb::Logger::set_current_level("emerg"); Open(); int size = 100; char *buffer_large = new char[size+1]; for (auto i = 0; i < size; i++) { buffer_large[i] = 'a'; } buffer_large[size+1] = '\0'; int num_items = 10000; KeyGenerator* kg = new RandomKeyGenerator(); std::map saved_data; for (auto i = 0; i < num_items; i++) { std::string key_str = kg->GetKey(0, i, 16); kdb::ByteArray key = kdb::NewDeepCopyByteArray(key_str.c_str(), key_str.size()); data_generator_->GenerateData(buffer_large, 100); kdb::ByteArray value = kdb::NewDeepCopyByteArray(buffer_large, 100); kdb::Status s = db_->Put(write_options_, key, value); if (!s.IsOK()) { fprintf(stderr, "ClientEmbedded - Error: %s\n", s.ToString().c_str()); } std::string value_str(buffer_large, 100); saved_data[key_str] = value_str; } // Force compaction to get a sequential iterator db_->Compact(); int count_items_end = 0; kdb::Iterator iterator = db_->NewIterator(read_options_); kdb::Status s = iterator.GetStatus(); if (!s.IsOK()) { fprintf(stderr, "Error: %s\n", 
s.ToString().c_str()); } // Check that the iterator is indeed a SequentialIterator bool is_seq = iterator._DEBUGGING_IsSequential(); ASSERT_EQ(is_seq, true); for (iterator.Begin(); iterator.IsValid(); iterator.Next()) { kdb::MultipartReader mp_reader = iterator.GetMultipartValue(); for (mp_reader.Begin(); mp_reader.IsValid(); mp_reader.Next()) { kdb::ByteArray part; kdb::Status s = mp_reader.GetPart(&part); } kdb::Status s = mp_reader.GetStatus(); if (s.IsOK()) { count_items_end += 1; } else { fprintf(stderr, "ClientEmbedded - Error: %s\n", s.ToString().c_str()); } } delete[] buffer_large; ASSERT_EQ(count_items_end, num_items); iterator.Close(); Close(); } } std::string run_command(const char* command) { FILE *file; char res[256]; file = popen(command, "r"); if (file == NULL) { fprintf(stderr, "run_command(): %s\n", strerror(errno)); exit(1); } std::string ret; while (fgets(res, sizeof(res), file) != NULL) { ret = std::string(res); } if (ret.size() >= 1) { // remove trailing newline character ret = ret.substr(0, ret.size() - 1); } pclose(file); return ret; } bool exists_program(const char* program_name) { std::string command("which "); command += program_name; command += " 2>/dev/null"; std::string ret = run_command(command.c_str()); size_t pos = ret.find(program_name); if (pos == std::string::npos) return false; return true; } std::string compute_external_md5(const char* filepath) { // Compute the md5 of a file using an external source if (exists_program("md5")) { std::string command("md5 "); command += filepath; std::string ret = run_command(command.c_str()); size_t pos = ret.find("="); std::string md5 = ret.substr(pos); //fprintf(stderr, "%s - %s\n", ret.c_str(), md5.c_str()); return md5; } else if (exists_program("md5sum")) { std::string command("md5sum "); command += filepath; std::string ret = run_command(command.c_str()); size_t pos = ret.find(" "); std::string md5 = ret.substr(0, pos); //fprintf(stderr, "%s - %s\n", ret.c_str(), md5.c_str()); return md5; } 
else { fprintf(stderr, "Error: could not find any external program to calculate the md5 checksum\n"); exit(1); } } TEST(DBTest, SingleThreadSingleLargeEntry) { while (IterateOverOptions()) { Open(); kdb::Logger::set_current_level("emerg"); uint64_t total_size = (uint64_t)1 << 30; //total_size *= 5; total_size = 1024*1024 * 2; int buffersize = 1024 * 64; char buffer[buffersize]; for (int i = 0; i < buffersize; ++i) { buffer[i] = 'a'; } std::string key_str = "myentry"; kdb::Status s; std::seed_seq seq{1, 2, 3, 4, 5, 6, 7}; std::mt19937 generator(seq); std::uniform_int_distribution random_dist(0,255); //char buffer_full[total_size]; //usleep(10 * 1000000); kdb::ByteArray key = kdb::NewDeepCopyByteArray(key_str.c_str(), key_str.size()); kdb::MultipartWriter mp_writer = db_->NewMultipartWriter(write_options_, key, total_size); int fd = open("/tmp/kingdb-input", O_WRONLY|O_CREAT|O_TRUNC, 0644); for (uint64_t i = 0; i < total_size; i += buffersize) { for (int j = 0; j < buffersize; ++j) { char random_char = static_cast(random_dist(generator)); buffer[j] = random_char; } int size_current = buffersize; if (i + size_current > total_size) { size_current = total_size - i; } kdb::ByteArray value = kdb::NewDeepCopyByteArray(buffer, size_current); s = mp_writer.PutPart(value); if (write(fd, buffer, size_current) < 0) { fprintf(stderr, "write(): %s\n", strerror(errno)); } if (!s.IsOK()) { fprintf(stderr, "PutPart(): %s\n", s.ToString().c_str()); } } close(fd); std::string md5_input = compute_external_md5("/tmp/kingdb-input"); usleep(4 * 1000000); kdb::MultipartReader mp_reader = db_->NewMultipartReader(read_options_, key); ByteArray value_out; uint64_t bytes_read = 0; int fd_output = open("/tmp/kingdb-output", O_WRONLY|O_CREAT|O_TRUNC, 0644); for (mp_reader.Begin(); mp_reader.IsValid(); mp_reader.Next()) { ByteArray part; Status s = mp_reader.GetPart(&part); if (write(fd_output, part.data(), part.size()) < 0) { fprintf(stderr, "ClientEmbedded - Couldn't write to output file: 
[%s]\n", strerror(errno)); } bytes_read += part.size(); } s = mp_reader.GetStatus(); if (!s.IsOK()) { fprintf(stderr, "ClientEmbedded - Error: %s\n", s.ToString().c_str()); } close(fd_output); ASSERT_EQ(s.IsOK(), true); std::string md5_output = compute_external_md5("/tmp/kingdb-output"); bool is_md5_valid = (md5_input == md5_output); ASSERT_EQ(is_md5_valid, true); Close(); } } TEST(DBTest, FileUtil) { int fd = open("/tmp/allocate", O_WRONLY|O_CREAT, 0644); auto start = std::chrono::high_resolution_clock::now(); size_t mysize = 1024*1024 * (int64_t)256; fprintf(stderr, "mysize: %zu\n", mysize); Status s = FileUtil::fallocate(fd, mysize); std::cout << s.ToString() << std::endl; close(fd); auto end = std::chrono::high_resolution_clock::now(); std::chrono::duration duration = end - start; std::chrono::milliseconds d = std::chrono::duration_cast(duration); std::cout << d.count() << " ms" << std::endl; fprintf(stderr, "Free size: %" PRIu64 " GB\n", FileUtil::fs_free_space("/tmp/") / (1024*1024*256)); } } // end namespace kdb void handler(int sig) { int depth_max = 20; void *array[depth_max]; size_t depth; depth = backtrace(array, depth_max); fprintf(stderr, "Error: signal %d:\n", sig); backtrace_symbols_fd(array, depth, STDERR_FILENO); exit(1); } int main() { signal(SIGSEGV, handler); signal(SIGABRT, handler); return kdb::test::RunAllTests(); } ================================================ FILE: unit-tests/testharness.cc ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. // The code below was copied from LevelDB. A few changes were applied to make it // self-sufficient and part of KingDB. // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
See the AUTHORS file for names of contributors. #include "unit-tests/testharness.h" namespace kdb { namespace test { namespace { struct Test { const char* base; const char* name; void (*func)(); }; std::vector* tests; } bool RegisterTest(const char* base, const char* name, void (*func)()) { if (tests == NULL) { tests = new std::vector; } Test t; t.base = base; t.name = name; t.func = func; tests->push_back(t); return true; } int RunAllTests() { const char* matcher = getenv("LEVELDB_TESTS"); int num = 0; if (tests != NULL) { for (size_t i = 0; i < tests->size(); i++) { const Test& t = (*tests)[i]; if (matcher != NULL) { std::string name = t.base; name.push_back('.'); name.append(t.name); if (strstr(name.c_str(), matcher) == NULL) { continue; } } fprintf(stderr, "==== Test %s.%s\n", t.base, t.name); (*t.func)(); ++num; } } fprintf(stderr, "==== PASSED %d tests\n", num); return 0; } std::string TmpDir() { std::string dir; //Status s = Env::Default()->GetTestDirectory(&dir); //ASSERT_TRUE(s.ok()) << s.ToString(); dir = "/tmp/"; return dir; } int RandomSeed() { const char* env = getenv("TEST_RANDOM_SEED"); int result = (env != NULL ? atoi(env) : 301); if (result <= 0) { result = 301; } return result; } } // namespace test } // namespace kdb ================================================ FILE: unit-tests/testharness.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. // The code below was copied from LevelDB. A few changes were applied to make it // self-sufficient and part of KingDB. // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
#ifndef STORAGE_KINGDB_UTIL_TESTHARNESS_H_ #define STORAGE_KINGDB_UTIL_TESTHARNESS_H_ #include #include #include #include #include #include #include #include #include "util/status.h" namespace kdb { namespace test { // Run some of the tests registered by the TEST() macro. If the // environment variable "KINGDB_TESTS" is not set, runs all tests. // Otherwise, runs only the tests whose name contains the value of // "KINGDB_TESTS" as a substring. E.g., suppose the tests are: // TEST(Foo, Hello) { ... } // TEST(Foo, World) { ... } // KINGDB_TESTS=Hello will run the first test // KINGDB_TESTS=o will run both tests // KINGDB_TESTS=Junk will run no tests // // Returns 0 if all tests pass. // Dies or returns a non-zero value if some test fails. extern int RunAllTests(); // Return the directory to use for temporary storage. extern std::string TmpDir(); // Return a randomization seed for this run. Typically returns the // same number on repeated invocations of this binary, but automated // runs may be able to vary the seed. extern int RandomSeed(); // An instance of Tester is allocated to hold temporary state during // the execution of an assertion. class Tester { private: bool ok_; const char* fname_; int line_; std::stringstream ss_; public: Tester(const char* f, int l) : ok_(true), fname_(f), line_(l) { } ~Tester() { if (!ok_) { fprintf(stderr, "%s:%d:%s\n", fname_, line_, ss_.str().c_str()); exit(1); } } Tester& Is(bool b, const char* msg) { if (!b) { ss_ << " Assertion failure " << msg; ok_ = false; } return *this; } Tester& IsOk(const Status& s) { if (!s.IsOK()) { ss_ << " " << s.ToString(); ok_ = false; } return *this; } #define BINARY_OP(name,op) \ template \ Tester& name(const X& x, const Y& y) { \ if (! 
(x op y)) { \ ss_ << " failed: " << x << (" " #op " ") << y; \ ok_ = false; \ } \ return *this; \ } BINARY_OP(IsEq, ==) BINARY_OP(IsNe, !=) BINARY_OP(IsGe, >=) BINARY_OP(IsGt, >) BINARY_OP(IsLe, <=) BINARY_OP(IsLt, <) #undef BINARY_OP // Attach the specified value to the error message if an error has occurred template Tester& operator<<(const V& value) { if (!ok_) { ss_ << " " << value; } return *this; } }; #define ASSERT_TRUE(c) ::kdb::test::Tester(__FILE__, __LINE__).Is((c), #c) #define ASSERT_OK(s) ::kdb::test::Tester(__FILE__, __LINE__).IsOk((s)) #define ASSERT_EQ(a,b) ::kdb::test::Tester(__FILE__, __LINE__).IsEq((a),(b)) #define ASSERT_NE(a,b) ::kdb::test::Tester(__FILE__, __LINE__).IsNe((a),(b)) #define ASSERT_GE(a,b) ::kdb::test::Tester(__FILE__, __LINE__).IsGe((a),(b)) #define ASSERT_GT(a,b) ::kdb::test::Tester(__FILE__, __LINE__).IsGt((a),(b)) #define ASSERT_LE(a,b) ::kdb::test::Tester(__FILE__, __LINE__).IsLe((a),(b)) #define ASSERT_LT(a,b) ::kdb::test::Tester(__FILE__, __LINE__).IsLt((a),(b)) #define TCONCAT(a,b) TCONCAT1(a,b) #define TCONCAT1(a,b) a##b #define TEST(base,name) \ class TCONCAT(_Test_,name) : public base { \ public: \ void _Run(); \ static void _RunIt() { \ TCONCAT(_Test_,name) t; \ t._Run(); \ } \ }; \ bool TCONCAT(_Test_ignored_,name) = \ ::kdb::test::RegisterTest(#base, #name, &TCONCAT(_Test_,name)::_RunIt); \ void TCONCAT(_Test_,name)::_Run() // Register the specified test. Typically not used directly, but // invoked via the macro expansion of TEST. extern bool RegisterTest(const char* base, const char* name, void (*func)()); } // namespace test } // namespace kdb #endif // STORAGE_KINGDB_UTIL_TESTHARNESS_H_ ================================================ FILE: util/byte_array.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. 
#ifndef KINGDB_BYTE_ARRAY_H_ #define KINGDB_BYTE_ARRAY_H_ #include "util/debug.h" #include #include #include #include #include #include #include #include #include #include "util/file.h" #include "util/logger.h" #include "util/options.h" #include "util/filepool.h" namespace kdb { class ByteArrayResource { public: ByteArrayResource() {} virtual ~ByteArrayResource() {} virtual char* data() = 0; virtual const char* data_const() = 0; virtual uint64_t size() = 0; virtual const uint64_t size_const() = 0; virtual uint64_t size_compressed() = 0; virtual uint64_t const size_compressed_const() = 0; }; class PooledByteArrayResource: public ByteArrayResource { friend class ByteArray; public: PooledByteArrayResource(std::shared_ptr file_manager, uint32_t fileid, const std::string& filepath, uint64_t filesize) : data_(nullptr), fileid_(0), size_(0), size_compressed_(0), file_manager_(file_manager) { Status s = file_manager_->GetFile(fileid, filepath, filesize, &file_resource_); if (s.IsOK()) { data_ = file_resource_.mmap; size_ = file_resource_.filesize; fileid_ = fileid; } else { fprintf(stderr, "invalid file resource: should never happen\n"); } } virtual ~PooledByteArrayResource() { file_manager_->ReleaseFile(fileid_, file_resource_.filesize); } virtual char* data() { return data_; } virtual const char* data_const() { return data_; } virtual uint64_t size() { return size_; } virtual const uint64_t size_const() { return size_; } virtual uint64_t size_compressed() { return size_compressed_; } virtual const uint64_t size_compressed_const() { return size_compressed_; } private: char *data_; uint32_t fileid_; uint64_t size_; uint64_t size_compressed_; FileResource file_resource_; std::shared_ptr file_manager_; }; class MmappedByteArrayResource: public ByteArrayResource { friend class ByteArray; public: MmappedByteArrayResource(const std::string& filepath, uint64_t filesize) : data_(nullptr), size_(0), size_compressed_(0), mmap_(filepath, filesize) { if (mmap_.is_valid()) { data_ = 
mmap_.datafile(); size_ = mmap_.filesize(); } } virtual ~MmappedByteArrayResource() { } virtual char* data() { return data_; } virtual const char* data_const() { return data_; } virtual uint64_t size() { return size_; } virtual const uint64_t size_const() { return size_; } virtual uint64_t size_compressed() { return size_compressed_; } virtual const uint64_t size_compressed_const() { return size_compressed_; } private: char *data_; uint64_t size_; uint64_t size_compressed_; Mmap mmap_; }; class AllocatedByteArrayResource: public ByteArrayResource { friend class ByteArray; public: AllocatedByteArrayResource(char *data, uint64_t size, bool deep_copy) : data_(nullptr), size_(0), size_compressed_(0) { if (deep_copy) { size_ = size; data_ = new char[size_]; memcpy(data_, data, size_); } else { size_ = size; data_ = data; } } AllocatedByteArrayResource(uint64_t size) : data_(nullptr), size_(0), size_compressed_(0) { size_ = size; data_ = new char[size_]; } virtual ~AllocatedByteArrayResource() { delete[] data_; } virtual char* data() { return data_; } virtual const char* data_const() { return data_; } virtual uint64_t size() { return size_; } virtual const uint64_t size_const() { return size_; } virtual uint64_t size_compressed() { return size_compressed_; } virtual const uint64_t size_compressed_const() { return size_compressed_; } private: char *data_; uint64_t size_; uint64_t size_compressed_; }; class PointerByteArrayResource: public ByteArrayResource { friend class ByteArray; public: PointerByteArrayResource(const char *data, uint64_t size) : data_(data), size_(size), size_compressed_(0) { } virtual ~PointerByteArrayResource() {} virtual char* data() { return const_cast(data_); } virtual const char* data_const() { return data_; } virtual uint64_t size() { return size_; } virtual const uint64_t size_const() { return size_; } virtual uint64_t size_compressed() { return size_compressed_; } virtual const uint64_t size_compressed_const() { return size_compressed_; } 
private: const char *data_; uint64_t size_; uint64_t size_compressed_; }; class ByteArray { // TODO: what is happenning when a ByteArray is assigned to another ByteArray? friend class MultipartReader; friend class StorageEngine; friend class Database; friend class Snapshot; friend class WriteBuffer; friend class RegularIterator; friend class SequentialIterator; friend class NetworkTask; friend class CompressorLZ4; public: ByteArray() : size_(0), size_compressed_(0), offset_(0), checksum_(0), checksum_initial_(0) { } virtual ~ByteArray() { } virtual char* data() { return resource_->data() + offset_; } virtual const char* data_const() const { return resource_->data_const() + offset_; } virtual uint64_t size() { return size_; } virtual const uint64_t size_const() const { return size_; } virtual std::string ToString() { return std::string(data(), size()); } static ByteArray NewShallowCopyByteArray(char* data, uint64_t size) { ByteArray byte_array; byte_array.resource_ = std::make_shared(data, size, false); byte_array.size_ = size; return byte_array; } static ByteArray NewDeepCopyByteArray(const char* data, uint64_t size) { char* data_non_const = const_cast(data); ByteArray byte_array; byte_array.resource_ = std::make_shared(data_non_const, size, true); byte_array.size_ = size; return byte_array; } static ByteArray NewDeepCopyByteArray(const std::string& str) { return NewDeepCopyByteArray(str.c_str(), str.size()); } static ByteArray NewMmappedByteArray(const std::string& filepath, uint64_t filesize) { ByteArray byte_array; byte_array.resource_ = std::make_shared(filepath, filesize); byte_array.size_ = filesize; return byte_array; } static ByteArray NewPointerByteArray(const char* data, uint64_t size) { ByteArray byte_array; byte_array.resource_ = std::make_shared(data, size); byte_array.size_ = size; return byte_array; } bool operator ==(const ByteArray &right) const { return ( size_const() == right.size_const() && memcmp(data_const(), right.data_const(), size_const()) 
== 0); } private: static ByteArray NewAllocatedMemoryByteArray(uint64_t size) { ByteArray byte_array; byte_array.resource_ = std::make_shared(size); byte_array.size_ = size; return byte_array; } static ByteArray NewReferenceByteArray(ByteArray& byte_array_in) { // TODO: make this the =operator() ByteArray byte_array = byte_array_in; return byte_array; } static ByteArray NewPooledByteArray(std::shared_ptr file_manager, uint32_t fileid, const std::string& filepath, uint64_t filesize) { ByteArray byte_array; byte_array.resource_ = std::make_shared(file_manager, fileid, filepath, filesize); byte_array.size_ = filesize; return byte_array; } static ByteArray NewEmptyByteArray() { return ByteArray(); } virtual uint64_t size_compressed() { return size_compressed_; } virtual uint64_t size_compressed_const() const { return size_compressed_; } virtual void set_size(uint64_t s) { size_ = s; } virtual void set_size_compressed(uint64_t s) { size_compressed_ = s; } virtual uint64_t is_compressed() { return (size_compressed_ != 0); } virtual void set_offset(uint64_t o) { offset_ = o; } virtual void increment_offset(uint64_t inc) { offset_ += inc; } virtual uint32_t checksum() { return checksum_; } virtual uint32_t checksum_initial() { return checksum_initial_; } virtual void set_checksum(uint32_t c) { checksum_ = c; } virtual void set_checksum_initial(uint32_t c) { checksum_initial_ = c; } std::shared_ptr resource_; uint64_t size_; uint64_t size_compressed_; uint64_t offset_; uint32_t checksum_; // checksum for value_; uint32_t checksum_initial_; // initial checksum for value_ }; // ByteArray helpers inline ByteArray NewShallowCopyByteArray(char* data, uint64_t size) { return ByteArray::NewShallowCopyByteArray(data, size); } inline ByteArray NewDeepCopyByteArray(const char* data, uint64_t size) { return ByteArray::NewDeepCopyByteArray(data, size); } inline ByteArray NewDeepCopyByteArray(const std::string& str) { return ByteArray::NewDeepCopyByteArray(str.c_str(), str.size()); } 
inline ByteArray NewMmappedByteArray(const std::string& filepath, uint64_t filesize) { return ByteArray::NewMmappedByteArray(filepath, filesize); } inline ByteArray NewPointerByteArray(const char* data, uint64_t size) { return ByteArray::NewPointerByteArray(data, size); } } // namespace kdb #endif // KINGDB_BYTE_ARRAY_H_ ================================================ FILE: util/config_parser.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. #ifndef KINGDB_CONFIG_PARSER_H_ #define KINGDB_CONFIG_PARSER_H_ #include "util/debug.h" #include #include #include #include #include #include #include #include #include #include "util/status.h" // TODO: have short letter version of parameter names? ex: -l or --loglevel for the same parameter // TODO: error if duplicate parameter is same scope (file or command-line) namespace kdb { class Parameter { public: std::string name; std::string description; std::string default_value; bool is_mandatory; virtual ~Parameter() {} virtual Status Parse(const std::string& config, const std::string& value, const std::string& filepath, int line_number) = 0; virtual std::string Type() = 0; uint64_t GetMultiplier(std::string str) { std::regex regex_number {"([\\d]+)[\\s]*([^\\s]*)"}; std::smatch matches; if (!std::regex_search(str, matches, regex_number) || matches.size() != 3) { return 1; } std::string number(matches[1]); std::string unit(matches[2]); //fprintf(stderr, "num:%s, unit:[%s]\n", number.c_str(), unit.c_str()); std::transform(unit.begin(), unit.end(), unit.begin(), ::tolower); if (unit == "") return 1; // Parsing space units if (unit == "b" || unit == "byte" || unit == "bytes") { return 1; } else if (unit == "kb") { return 1024; } else if (unit == "mb") { return 1024*1024; } else if (unit == "gb") { return (uint64_t)1024*(uint64_t)1024*1024; } else if (unit == 
"tb") { return (uint64_t)1024*1024*1024*1024; } else if (unit == "pb") { return (uint64_t)1024*1024*1024*1024*1024; // Parsing temporal units } else if (unit == "ms" || unit == "millisecond" || unit == "milliseconds") { return 1; } else if (unit == "s" || unit == "second" || unit == "seconds") { return 1000; } else if (unit == "minute" || unit == "minutes") { return 1000 * 60; } else if (unit == "hour" || unit == "hours") { return 1000 * 60 * 60; } return 0; } }; class FlagParameter: public Parameter { public: bool* is_present; FlagParameter(const std::string& name_in, bool* is_present_in, bool mandatory_in, const std::string& description_in) { name = name_in; is_mandatory = mandatory_in; description = description_in; default_value = "not set"; is_present = is_present_in; *is_present = false; } virtual ~FlagParameter() {} virtual Status Parse(const std::string& config, const std::string& value, const std::string& filepath, int line_number) { *is_present = true; return Status::OK(); } virtual std::string Type() { return "Flag"; } }; class BooleanParameter: public Parameter { public: bool* state; BooleanParameter(const std::string& name_in, bool default_in, bool* is_present_in, bool mandatory_in, const std::string& description_in) { name = name_in; is_mandatory = mandatory_in; description = description_in; default_value = default_in ? 
"True" : "False"; state = is_present_in; *state = default_in; } virtual ~BooleanParameter() {} virtual Status Parse(const std::string& config, const std::string& value_in, const std::string& filepath, int line_number) { std::string value(value_in); std::transform(value.begin(), value.end(), value.begin(), ::tolower); if (value == "true" || value == "1") { *state = true; } else if (value == "false" || value == "0") { *state = false; } else { std::string str_line_number = "Invalid value for boolean parameter [" + config + "] in file [" + filepath + "] on line " + std::to_string(line_number); return Status::IOError("ConfigParser", str_line_number); } return Status::OK(); } virtual std::string Type() { return "Boolean"; } }; class UnsignedInt32Parameter: public Parameter { public: uint32_t *ptr; UnsignedInt32Parameter(const std::string& name_in, const std::string default_in, uint32_t *ptr_in, bool mandatory_in, const std::string& description_in) { name = name_in; is_mandatory = mandatory_in; description = description_in; default_value = default_in; ptr = ptr_in; Status s = Parse(name, default_in, "default-value", 0); if (!s.IsOK()) { fprintf(stderr, "Error: invalid default value for parameter [%s]\n", name.c_str()); exit(1); } } virtual ~UnsignedInt32Parameter() {} virtual uint32_t Get() { return *ptr; } virtual Status Parse(const std::string& config, const std::string& value, const std::string& filepath, int line_number) { int num_scanned = sscanf(value.c_str(), "%u", ptr); if (num_scanned != 1) { std::string str_line_number = "Invalid value for unsigned 32-bit integer parameter [" + config + "] in file [" + filepath + "] on line " + std::to_string(line_number); return Status::IOError("ConfigParser", str_line_number); } uint64_t multiplier = GetMultiplier(value); if (multiplier == 0) { std::string str_line_number = "Invalid unit for parameter [" + config + "] in file [" + filepath + "] on line " + std::to_string(line_number); return Status::IOError("ConfigParser", 
str_line_number); } *ptr = *ptr * multiplier; return Status::OK(); } virtual std::string Type() { return "Unsigned 32-bit integer"; } }; class UnsignedInt64Parameter: public Parameter { public: uint64_t *ptr; UnsignedInt64Parameter(const std::string& name_in, const std::string& default_in, uint64_t *ptr_in, bool mandatory_in, const std::string& description_in) { name = name_in; is_mandatory = mandatory_in; description = description_in; default_value = default_in; ptr = ptr_in; Status s = Parse(name, default_in, "default-value", 0); if (!s.IsOK()) { fprintf(stderr, "Error: invalid default value for parameter [%s]\n", name.c_str()); exit(1); } } virtual ~UnsignedInt64Parameter() {} virtual uint64_t Get() { return *ptr; } virtual Status Parse(const std::string& config, const std::string& value, const std::string& filepath, int line_number) { int num_scanned = sscanf(value.c_str(), "%" PRIu64, ptr); if (num_scanned != 1) { std::string str_line_number = "Invalid value for unsigned 64-bit integer parameter [" + config + "] in file [" + filepath + "] on line " + std::to_string(line_number); return Status::IOError("ConfigParser", str_line_number); } uint64_t multiplier = GetMultiplier(value); if (multiplier == 0) { std::string str_line_number = "Invalid unit for parameter [" + config + "] in file [" + filepath + "] on line " + std::to_string(line_number); return Status::IOError("ConfigParser", str_line_number); } *ptr = *ptr * multiplier; return Status::OK(); } virtual std::string Type() { return "Unsigned 64-bit integer"; } }; class DoubleParameter: public Parameter { public: double *ptr; DoubleParameter(const std::string& name_in, const std::string& default_in, double *ptr_in, bool mandatory_in, const std::string& description_in) { name = name_in; is_mandatory = mandatory_in; description = description_in; default_value = default_in; ptr = ptr_in; Status s = Parse(name, default_in, "default-value", 0); if (!s.IsOK()) { fprintf(stderr, "Error: invalid default value for 
parameter [%s]\n", name.c_str()); exit(1); } } virtual ~DoubleParameter() {} virtual double Get() { return *ptr; } virtual Status Parse(const std::string& config, const std::string& value, const std::string& filepath, int line_number) { int num_scanned = sscanf(value.c_str(), "%lf", ptr); if (num_scanned != 1) { std::string str_line_number = "Invalid value for double-precision number parameter [" + config + "] in file [" + filepath + "] on line " + std::to_string(line_number); return Status::IOError("ConfigParser", str_line_number); } return Status::OK(); } virtual std::string Type() { return "Double-precision number"; } }; class StringParameter: public Parameter { public: std::string *ptr; StringParameter(const std::string& name_in, const std::string& default_in, std::string* ptr_in, bool mandatory_in, const std::string& description_in) { name = name_in; is_mandatory = mandatory_in; description = description_in; default_value = default_in; ptr = ptr_in; *ptr = default_in; } virtual ~StringParameter() {} virtual std::string Get() { return *ptr; } virtual Status Parse(const std::string& config, const std::string& value, const std::string& filepath, int line_number) { *ptr = value; return Status::OK(); } virtual std::string Type() { return "String"; } }; class ConfigParser { public: ConfigParser() : error_if_unknown_parameters(true) { } ~ConfigParser() { for (size_t i = 0; i < parameters_.size(); i++) { delete parameters_[i]; } } void AddParameter(Parameter* parameter) { parameters_.push_back(parameter); if (parameter->is_mandatory) { mandatories_.insert(parameter->name); } } void SetDefaultValue(const std::string name, const std::string default_value) { // This is used to change the default value of a parameter after // the parameter was added to the parser. 
auto it = parameters_.begin(); for (; it != parameters_.end(); ++it) { if ((*it)->name == name) break; } if (it != parameters_.end()) { (*it)->default_value = default_value; } } bool FoundAllMandatoryParameters() { return (mandatories_.size() == 0); } void PrintAllMissingMandatoryParameters() { if (mandatories_.size() == 0) return; fprintf(stderr, "Error: the following mandatory parameters are missing:\n"); for (auto& name: mandatories_) { fprintf(stderr, "%s\n", name.c_str()); } } int min_int(int a, int b) { return a < b ? a : b; } std::string AlignString(int margin, int column, std::string& str) { // Obviously not efficient, but simple and fast enough to format // the parameters so they can be displayed in PrintUsage() std::string str_aligned; size_t i = 0; while (i < str.size()) { size_t j_start = i + column; size_t j = (j_start <= str.size()) ? j_start : i; while (j > i) { if (str[j] == ' ') break; j--; } if (j <= i) j = str.size(); for (int k = 0; k < margin; k++) str_aligned += " "; str_aligned += str.substr(i, j - i); if (j + 1 < str.size()) str_aligned += "\n"; i = j + 1; } return str_aligned; } void PrintUsage() { int margin = 6; int column = 74; std::string str_margin = ""; for (int k = 0; k < margin; k++) str_margin += " "; for (auto &p: parameters_) { fprintf(stdout, " --%s:\n", p->name.c_str()); std::string d_aligned = AlignString(margin, column, p->description); fprintf(stdout, "%s\n", d_aligned.c_str()); if (mandatories_.find(p->name) == mandatories_.end()) { fprintf(stdout, "%sDefault value: %s (%s)\n", str_margin.c_str(), p->default_value.c_str(), p->Type().c_str()); } else { fprintf(stdout, "%sThis parameter is *mandatory* (%s)\n", str_margin.c_str(), p->Type().c_str()); } fprintf(stdout, "\n"); } } void PrintMarkdown() { for (auto &p: parameters_) { fprintf(stdout, "`%s` \n%s \n", p->name.c_str(), p->description.c_str()); if (mandatories_.find(p->name) == mandatories_.end()) { fprintf(stdout, "Default value: %s (%s)\n", p->default_value.c_str(), 
p->Type().c_str()); } else { fprintf(stdout, "This parameter is *mandatory* (%s)\n", p->Type().c_str()); } fprintf(stdout, "\n"); } } Status LoadDefaultValues() { return ParseCommandLine(0, nullptr); } Status ParseCommandLine(int argc, char **argv) { std::map parameters; for (auto& p: parameters_) { parameters[p->name] = p; } int i = 1; while (i < argc) { if (strncmp(argv[i], "--", 2) != 0) { if (error_if_unknown_parameters) { std::string msg = "Invalid parameter [" + std::string(argv[i]) + "]"; return Status::IOError("ConfigReader::ReadCommandLine()", msg); } else { i++; continue; } } std::string argument_raw(argv[i]); std::string argument = argument_raw.substr(2); // Find '=' sign if any bool has_equal_sign = false; std::string value; int pos = argument.find('='); if (pos != -1) { value = argument.substr(pos+1); argument = argument.substr(0, pos); has_equal_sign = true; //fprintf(stderr, "argument:[%s] value:[%s]\n", argument.c_str(), value.c_str()); } // Check if parameter exists if (parameters.find(argument) == parameters.end()) { if (error_if_unknown_parameters) { std::string msg = "Unknown parameter [" + argument + "]"; return Status::IOError("ConfigReader::ReadCommandLine()", msg); } else { i++; continue; } } // Rejects flag parameters with equal signs bool is_flag_parameter = (dynamic_cast(parameters[argument]) != nullptr); if (is_flag_parameter && has_equal_sign) { std::string msg = "The argument [" + std::string(argv[i]) + "] is of type FlagParameter and has an equal sign"; return Status::IOError("ConfigReader::ReadCommandLine()", msg); } // No '=' sign, using next parameter as value if (!is_flag_parameter && !has_equal_sign) { if (i+1 >= argc) { std::string msg = "Missing value for parameter [" + std::string(argv[i]) + "]"; return Status::IOError("ConfigReader::ReadCommandLine()", msg); } if (strncmp(argv[i+1], "--", 2) == 0) { std::string msg = "Missing value for parameter [" + std::string(argv[i]) + "]"; return 
Status::IOError("ConfigReader::ReadCommandLine()", msg); } i++; value = argv[i]; } Status s = parameters[argument]->Parse(argument, value, "command-line", 0); if (!s.IsOK()) return s; mandatories_.erase(argument); i++; } return Status::OK(); } Status ParseFile(std::string filepath) { std::map parameters; for (auto& p: parameters_) { parameters[p->name] = p; } struct stat info; if (stat(filepath.c_str(), &info) != 0) { return Status::IOError("ConfigParser", "The file specified does not exists: " + filepath); } std::ifstream file(filepath); std::string str; std::regex regex_config {"[\\s]*([^\\s]+)[\\s]+(.*)[\\s]*"}; int line_number = 0; while (std::getline(file, str)) { line_number += 1; // Deletes the comment if any char *line = const_cast(str.c_str()); char *ptr = line; while (*ptr != '\0' && *ptr != '#') ptr++; *ptr = '\0'; // Checks if this is just an empty int size = ptr - line; ptr = line; while (*ptr != '\0' && (*ptr == ' ' || *ptr == '\t')) ptr++; if ((ptr - line) == size) continue; std::smatch matches; std::string line_cleaned(line); if (!std::regex_search(line_cleaned, matches, regex_config) || matches.size() != 3) { std::string str_line_number = "Error in file [" + filepath + "] on line " + std::to_string(line_number); return Status::IOError("ConfigParser", str_line_number); } std::string config(matches[1]); std::string value(matches[2]); if (parameters.find(config) == parameters.end()) { if (error_if_unknown_parameters) { std::string str_line_number = "Unknown parameter [" + config + "] in file [" + filepath + "] on line " + std::to_string(line_number); return Status::IOError("ConfigParser", str_line_number); } else { continue; } } Status s = parameters[config]->Parse(config, value, filepath, line_number); mandatories_.erase(config); if (!s.IsOK()) return s; } return Status::OK(); } bool error_if_unknown_parameters; private: std::vector parameters_; std::set mandatories_; }; } // namespace kdb #endif // KINGDB_CONFIG_PARSER_H_ 
================================================ FILE: util/debug.cc ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. #include "debug.h" namespace kdb { void PrintHex(const char* buffer, int size) { char hex[16] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'}; int col = 16; int i = 0; for (i = 0; i < size; i++) { int value1 = buffer[i] & 0x0f; int value2 = (buffer[i] & 0xf0) >> 4; fprintf(stderr, "%c%c ", hex[value1], hex[value2]); if (i && (i+1) % col == 0) fprintf(stderr, "\n"); } if (i && i % col != 0) fprintf(stderr, "\n"); } } // namespace kdb ================================================ FILE: util/debug.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. #ifndef KINGDB_DEBUG_H_ #define KINGDB_DEBUG_H_ #ifdef DEBUG // Using valgrind for c++11 #include #define _GLIBCXX_SYNCHRONIZATION_HAPPENS_BEFORE(addr) ANNOTATE_HAPPENS_BEFORE(addr) #define _GLIBCXX_SYNCHRONIZATION_HAPPENS_AFTER(addr) ANNOTATE_HAPPENS_AFTER(addr) #endif // DEBUG #include namespace kdb { void PrintHex(const char* buffer, int size); } // namespace kdb #endif // KINGDB_DEBUG_H_ ================================================ FILE: util/file.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. 
#ifndef KINGDB_FILE_H_ #define KINGDB_FILE_H_ #include "util/debug.h" #include #include #include #include #include #include #include #include #include #include "util/status.h" #include "util/logger.h" namespace kdb { class FileUtil { public: static void increase_limit_open_files() { struct rlimit rl; if (getrlimit(RLIMIT_NOFILE, &rl) == 0) { // TODO: linux compatibility //rl.rlim_cur = OPEN_MAX; rl.rlim_cur = 4096; if (setrlimit(RLIMIT_NOFILE, &rl) != 0) { fprintf(stderr, "Could not increase the limit of open files for this process"); } } } static std::string kingdb_getcwd() { static char *path = nullptr; if (path != nullptr) { return path; } char *buffer = nullptr; int size = 64; do { buffer = new char[size]; if (getcwd(buffer, size) != NULL) { break; } size *= 2; delete[] buffer; } while(true); path = buffer; return path; } // NOTE: Pre-allocating disk space is very tricky: fallocate() and // posix_fallocate() are not supported on all platforms, glibc sometimes // overrides posix_fallocate() with no warnings at all, and for some // filesystems that do not support any pre-allocation, the operating // system can fall back on writing zero bytes in the entire file, making // the operation painfully slow. // The code below is what SQLite and glibc are doing to fake // fallocate(), thus I am employing the same approach, not even checking // if a version of fallocate() was already given by the OS. This may not // be the fastest way to pre-allocate files, but at least it won't be // as slow as zeroing entire files. static Status fallocate(int fd, int64_t length) { // The code below was copied from fcntlSizeHint() in SQLite (public domain), // and modified for clarity and to fit KingDB's needs. /* If the OS does not have posix_fallocate(), fake it. First use ** ftruncate() to set the file size, then write a single byte to ** the last byte in each block within the extended region. 
This ** is the same technique used by glibc to implement posix_fallocate() ** on systems that do not have a real fallocate() system call. */ struct stat buf; if (fstat(fd, &buf) != 0) return Status::IOError("kingdb_fallocate() - fstat()", strerror(errno)); if (buf.st_size >= length) return Status::IOError("kingdb_fallocate()", "buf.st_size >= length"); const int blocksize = buf.st_blksize; if (!blocksize) return Status::IOError("kingdb_fallocate()", "Invalid block size"); if (ftruncate(fd, length) != 0) return Status::IOError("kingdb_fallocate() - ftruncate()", strerror(errno)); int num_bytes_written; int64_t offset_write = ((buf.st_size + 2 * blocksize - 1) / blocksize) * blocksize - 1; do { num_bytes_written = 0; if (lseek(fd, offset_write, SEEK_SET) == offset_write) { num_bytes_written = write(fd, "", 1); } offset_write += blocksize; } while (num_bytes_written == 1 && offset_write < length); if (num_bytes_written != 1) return Status::IOError("kingdb_fallocate() - write()", strerror(errno)); return Status::OK(); } static Status fallocate_filepath(std::string filepath, int64_t length) { int fd; if ((fd = open(filepath.c_str(), O_WRONLY|O_CREAT, 0644)) < 0) { return Status::IOError("kingdb_fallocate_filepath() - open()", strerror(errno)); } Status s = fallocate(fd, length); close(fd); return s; } static int64_t fs_free_space(const char *filepath) { struct statvfs stat; if (statvfs(filepath, &stat) != 0) { log::trace("disk_free_space()", "statvfs() error"); return -1; } if (stat.f_frsize) { return stat.f_frsize * stat.f_bavail; } else { return stat.f_bsize * stat.f_bavail; } } static Status remove_files_with_prefix(const char *dirpath, const std::string prefix) { DIR *directory; struct dirent *entry; if ((directory = opendir(dirpath)) == NULL) { return Status::IOError("Could not open directory", dirpath); } char filepath[FileUtil::maximum_path_size()]; Status s; struct stat info; while ((entry = readdir(directory)) != NULL) { int ret = snprintf(filepath, 
FileUtil::maximum_path_size(), "%s/%s", dirpath, entry->d_name); if (ret < 0 || ret >= FileUtil::maximum_path_size()) { log::emerg("remove_files_with_prefix()", "Filepath buffer is too small, could not build the filepath string for file [%s]", entry->d_name); continue; } if ( strncmp(entry->d_name, prefix.c_str(), prefix.size()) != 0 || stat(filepath, &info) != 0 || !(info.st_mode & S_IFREG)) { continue; } if (std::remove(filepath)) { log::warn("remove_files_with_prefix()", "Could not remove file [%s]", filepath); } } closedir(directory); return Status::OK(); } static int64_t maximum_path_size() { return 4096; } static int sync_file(int fd) { int ret; #ifdef F_FULLFSYNC // For Mac OS X ret = fcntl(fd, F_FULLFSYNC); #else ret = fdatasync(fd); #endif // F_FULLFSYNC return ret; } }; class Mmap { public: Mmap() : is_valid_(false), fd_(0), filesize_(0), datafile_(nullptr) { } Mmap(std::string filepath, int64_t filesize) : is_valid_(false), fd_(0), filesize_(filesize), datafile_(nullptr), filepath_(filepath) { Open(); } virtual ~Mmap() { Close(); } void Open(std::string& filepath, uint64_t filesize) { filepath_ = filepath; filesize_ = filesize; Open(); } void Open() { if ((fd_ = open(filepath_.c_str(), O_RDONLY)) < 0) { log::emerg("Mmap()::ctor()", "Could not open file [%s]: %s", filepath_.c_str(), strerror(errno)); return; } log::trace("Mmap::ctor()", "open file: ok"); datafile_ = static_cast(mmap(0, filesize_, PROT_READ, MAP_SHARED, fd_, 0)); if (datafile_ == MAP_FAILED) { log::emerg("Could not mmap() file [%s]: %s", filepath_.c_str(), strerror(errno)); return; } is_valid_ = true; } void Close() { if (datafile_ != nullptr) { munmap(datafile_, filesize_); close(fd_); datafile_ = nullptr; is_valid_ = false; log::debug("Mmap::~Mmap()", "released mmap on file: [%s]", filepath_.c_str()); } } char* datafile() { return datafile_; } int64_t filesize() { return filesize_; } bool is_valid_; bool is_valid() { return is_valid_; } int fd_; int64_t filesize_; char *datafile_; // For 
debugging std::string filepath_; const char* filepath() const { return filepath_.c_str(); } }; } // namespace kdb #endif // KINGDB_FILE_H_ ================================================ FILE: util/filepool.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. #ifndef KINGDB_FILEPOOL_H_ #define KINGDB_FILEPOOL_H_ #include "util/debug.h" #include #include #include // NOTE: The list of unused mmaps is currently a simple std::vector, which is // being iterated over whenever a search is done. This is fine for now, // and will need to be optimized only when and if needed. // NOTE: Having a shared mmap for the files can cause issues: if the file is // partially written, the filesize will be M. If the file gets more data, // the filesize goes higher, to a value of P > M. The shared mmap still // assumes a size of M and will fail if used to access data between // offsets within [M, P]. For that reason, the file pool keeps a single // mmap for every file size encountered, for as long as they are needed. // Other options could have been to: // 1. Use mremap(), but it is not portable. // 2. Make the file pool not hold an mmap pointer, but a shared pointer // to an object that has the shared mmap, and whenever a new size is // encountered for a file, release the shared mmap to the previous // size. That way, the file pool will only hold the mmap for the // latest file size, and the objects using the mmap to the previous // sizes would hold these mmaps for as long as they need them. 
namespace kdb { struct FileResource { uint32_t fileid; int fd; uint64_t filesize; char* mmap; int num_references; }; class FileManager { public: FileManager() { } ~FileManager() { } Status GetFile(uint32_t fileid, const std::string& filepath, uint64_t filesize, FileResource* file) { std::unique_lock lock(mutex_); bool found = false; // Look for the mmap in the file descriptor cache auto it = files_unused.begin(); while(it != files_unused.end()) { if (it->fileid != fileid) { ++it; } else if (it->filesize == filesize) { break; } else { // If the current item is for a file with same file id but different // filesize (in this case a smaller filesize), then the resource needs // to be released. munmap(it->mmap, it->filesize); close(it->fd); it = files_unused.erase(it); } } if (it != files_unused.end()) { *file = *it; files_unused.erase(it); file->num_references = 1; files_used.insert(std::pair(fileid, *file)); found = true; } else { auto range = files_used.equal_range(fileid); auto it = range.first; for (; it != range.second; ++it) { if (it->second.filesize == filesize) break; } if (it != range.second) { it->second.num_references += 1; *file = it->second; found = true; } } if (found) return Status::OK(); // If the file descriptor was not found, a new one is opened if (NumFiles() > MaxNumFiles() && !files_unused.empty()) { auto it = files_unused.begin(); munmap(it->mmap, it->filesize); close(it->fd); files_unused.erase(it); } int fd = 0; if ((fd = open(filepath.c_str(), O_RDONLY)) < 0) { log::emerg("FileManager::Mmap()::ctor()", "Could not open file [%s]: %s", filepath.c_str(), strerror(errno)); return Status::IOError("Could not open() file"); } log::trace("Mmap::ctor()", "open file: ok"); char* datafile = static_cast(mmap(0, filesize, PROT_READ, MAP_SHARED, fd, 0)); if (datafile == MAP_FAILED) { log::emerg("Could not mmap() file [%s]: %s", filepath.c_str(), strerror(errno)); return Status::IOError("Could not mmap() file"); } file->fileid = fileid; file->fd = fd; 
file->mmap = datafile; file->num_references = 1; file->filesize = filesize; files_used.insert(std::pair(fileid, *file)); return Status::OK(); } void ReleaseFile(uint32_t fileid, uint64_t filesize) { std::unique_lock lock(mutex_); auto range = files_used.equal_range(fileid); auto it = range.first; for (; it != range.second; ++it) { if (it->second.filesize == filesize) break; } if (it == range.second) return; if (it->second.num_references > 1) { it->second.num_references -= 1; } else { it->second.num_references = 0; files_unused.push_back(it->second); files_used.erase(it); } } int NumFiles() { return files_unused.size() + files_used.size(); } int MaxNumFiles() { return 2048; // TODO: make this is an internal parameter at some point? } private: std::mutex mutex_; std::vector files_unused; std::multimap files_used; }; } // namespace kdb #endif // KINGDB_FILEPOOL_H_ ================================================ FILE: util/logger.cc ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. #include "util/logger.h" namespace kdb { bool Logger::is_syslog_open_ = false; int Logger::level_ = Logger::kLogLevelSILENT; int Logger::log_target_ = Logger::kLogTargetStderr; std::string Logger::syslog_ident_ = "kingdb"; std::mutex Logger::mutex_; } ================================================ FILE: util/logger.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. // The source code in this file is based on LevelDB's PosixLogger. 
#ifndef KINGDB_LOGGER_H_ #define KINGDB_LOGGER_H_ #include "util/debug.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include namespace kdb { class Logger { public: Logger() { } virtual ~Logger() { } static void Logv(bool thread_safe, int level, int level_syslog, const char* logname, const char* format, ...) { va_list args; va_start(args, format); Logv(thread_safe, level, level_syslog, logname, format, args); va_end(args); } static void Logv(bool thread_safe, int level, int level_syslog, const char* logname, const char* format, va_list args) { if (level > current_level()) return; if (log_target_ == Logger::kLogTargetStderr && thread_safe) { mutex_.lock(); } char buffer[512]; for (int iter = 0; iter < 2; iter++) { char* base; int bufsize; if (iter == 0) { bufsize = sizeof(buffer); base = buffer; } else { bufsize = 1024 * 30; base = new char[bufsize]; } char* p = base; char* limit = base + bufsize; std::ostringstream ss; ss << std::this_thread::get_id(); if (log_target_ == Logger::kLogTargetStderr) { struct timeval now_tv; gettimeofday(&now_tv, NULL); const time_t seconds = now_tv.tv_sec; struct tm t; localtime_r(&seconds, &t); p += snprintf(p, limit - p, "%04d/%02d/%02d-%02d:%02d:%02d.%06d %s %s ", t.tm_year + 1900, t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec, static_cast(now_tv.tv_usec), ss.str().c_str(), logname); } else if (log_target_ == Logger::kLogTargetSyslog) { p += snprintf(p, limit - p, "%s %s ", ss.str().c_str(), logname); } if (p < limit) { va_list backup_args; va_copy(backup_args, args); p += vsnprintf(p, limit - p, format, backup_args); va_end(backup_args); } else { // Truncate to available space if necessary if (iter == 0) { continue; } else { p = limit - 1; } } if (p == base || p[-1] != '\0') { *p++ = '\0'; } if (log_target_ == Logger::kLogTargetStderr) { fprintf(stderr, "%s\n", base); } else if (log_target_ == Logger::kLogTargetSyslog) { if 
(!Logger::is_syslog_open_) { openlog(syslog_ident_.c_str(), 0, LOG_USER); Logger::is_syslog_open_ = true; } syslog(LOG_USER | level_syslog, "%s", base); } if (base != buffer) { delete[] base; } break; } if (log_target_ == Logger::kLogTargetStderr && thread_safe) { mutex_.unlock(); } } static int current_level() { return level_; } static void set_current_level(int l) { level_ = l; } static int set_current_level(const char* l_in) { std::string l(l_in); std::transform(l.begin(), l.end(), l.begin(), ::tolower); if (l == "silent") { set_current_level(kLogLevelSILENT); } else if (l == "emerg") { set_current_level(kLogLevelEMERG); } else if (l == "alert") { set_current_level(kLogLevelALERT); } else if (l == "crit") { set_current_level(kLogLevelCRIT); } else if (l == "error") { set_current_level(kLogLevelERROR); } else if (l == "warn") { set_current_level(kLogLevelWARN); } else if (l == "notice") { set_current_level(kLogLevelNOTICE); } else if (l == "info") { set_current_level(kLogLevelINFO); } else if (l == "debug") { set_current_level(kLogLevelDEBUG); } else if (l == "trace") { set_current_level(kLogLevelTRACE); } else { return -1; } return 0; } static void set_target(std::string log_target) { if (log_target == "stderr") { kdb::Logger::log_target_ = kdb::Logger::kLogTargetStderr; } else { kdb::Logger::log_target_ = kdb::Logger::kLogTargetSyslog; kdb::Logger::syslog_ident_ = log_target; } } enum Loglevel { kLogLevelSILENT=0, kLogLevelEMERG=1, kLogLevelALERT=2, kLogLevelCRIT=3, kLogLevelERROR=4, kLogLevelWARN=5, kLogLevelNOTICE=6, kLogLevelINFO=7, kLogLevelDEBUG=8, kLogLevelTRACE=9 }; enum Logtarget { kLogTargetStderr=0, kLogTargetSyslog=1 }; private: static bool is_syslog_open_; static int level_; static std::mutex mutex_; static int log_target_; static std::string syslog_ident_; }; class log { public: static void emerg(const char* logname, const char* format, ...) 
{ va_list args; va_start(args, format); Logger::Logv(false, Logger::kLogLevelEMERG, LOG_EMERG, logname, format, args); va_end(args); } static void alert(const char* logname, const char* format, ...) { va_list args; va_start(args, format); Logger::Logv(true, Logger::kLogLevelALERT, LOG_ALERT, logname, format, args); va_end(args); } static void crit(const char* logname, const char* format, ...) { va_list args; va_start(args, format); Logger::Logv(true, Logger::kLogLevelCRIT, LOG_CRIT, logname, format, args); va_end(args); } static void error(const char* logname, const char* format, ...) { va_list args; va_start(args, format); Logger::Logv(true, Logger::kLogLevelERROR, LOG_ERR, logname, format, args); va_end(args); } static void warn(const char* logname, const char* format, ...) { va_list args; va_start(args, format); Logger::Logv(true, Logger::kLogLevelWARN, LOG_WARNING, logname, format, args); va_end(args); } static void notice(const char* logname, const char* format, ...) { va_list args; va_start(args, format); Logger::Logv(true, Logger::kLogLevelNOTICE, LOG_NOTICE, logname, format, args); va_end(args); } static void info(const char* logname, const char* format, ...) { va_list args; va_start(args, format); Logger::Logv(true, Logger::kLogLevelINFO, LOG_INFO, logname, format, args); va_end(args); } static void debug(const char* logname, const char* format, ...) { va_list args; va_start(args, format); Logger::Logv(true, Logger::kLogLevelDEBUG, LOG_DEBUG, logname, format, args); va_end(args); } static void trace(const char* logname, const char* format, ...) { va_list args; va_start(args, format); // No TRACE level in syslog, so using DEBUG instead Logger::Logv(true, Logger::kLogLevelTRACE, LOG_DEBUG, logname, format, args); va_end(args); } }; /* #define LOG_EMERG(logname, fmt, ...) Logger::Logv(false, Logger::kLogLevelEMERG, logname, fmt, ##__VA_ARGS__) #define LOG_ALERT(logname, fmt, ...) 
Logger::Logv(true, Logger::kLogLevelALERT, logname, fmt, ##__VA_ARGS__) #define LOG_CRIT(logname, fmt, ...) Logger::Logv(true, Logger::kLogLevelCRIT, logname, fmt, ##__VA_ARGS__) #define LOG_ERROR(logname, fmt, ...) Logger::Logv(true, Logger::kLogLevelERROR, logname, fmt, ##__VA_ARGS__) #define LOG_WARN(logname, fmt, ...) Logger::Logv(true, Logger::kLogLevelWARN, logname, fmt, ##__VA_ARGS__) #define LOG_NOTICE(logname, fmt, ...) Logger::Logv(true, Logger::kLogLevelNOTICE, logname, fmt, ##__VA_ARGS__) #define LOG_INFO(logname, fmt, ...) Logger::Logv(true, Logger::kLogLevelINFO, logname, fmt, ##__VA_ARGS__) #define LOG_DEBUG(logname, fmt, ...) Logger::Logv(true, Logger::kLogLevelDEBUG, logname, fmt, ##__VA_ARGS__) #define LOG_TRACE(logname, fmt, ...) Logger::Logv(true, Logger::kLogLevelTRACE, logname, fmt, ##__VA_ARGS__) */ } #endif ================================================ FILE: util/options.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. 
#ifndef KINGDB_OPTIONS_H_
#define KINGDB_OPTIONS_H_

#include "util/debug.h"
#include "config_parser.h"

namespace kdb {

enum HashType {
  kMurmurHash3_64 = 0x0,
  kxxHash_64      = 0x1
};

enum CompressionType {
  kNoCompression  = 0x0,
  kLZ4Compression = 0x1
};

enum ChecksumType {
  kNoChecksum = 0x0,
  kCRC32C     = 0x1 // CRC-32C (Castagnoli)
};

enum WriteBufferMode {
  kWriteBufferModeDirect   = 0x0,
  kWriteBufferModeAdaptive = 0x1
};

struct CompressionOptions {
  CompressionOptions(CompressionType ct)
      : type(ct) {
  }
  CompressionType type;
};

// Options controlling a database instance. The constructor registers every
// tunable parameter with a ConfigParser and loads their default values, so a
// default-constructed DatabaseOptions is fully initialized.
struct DatabaseOptions {
 public:
  DatabaseOptions()
      : internal__hstable_header_size(8192), // bytes
        internal__num_iterations_per_lock(10),
        internal__close_timeout(500), // milliseconds
        internal__open_file_retry_delay(5000), // milliseconds
        internal__size_multipart_required(1024*1024), // bytes
        internal__compaction_check_interval(500), // milliseconds
        hash(kxxHash_64),
        compression(kLZ4Compression),
        checksum(kCRC32C),
        write_buffer__mode(kWriteBufferModeDirect) {
    // Fill the non-internal members with the defaults declared in
    // AddParametersToConfigParser() below.
    DatabaseOptions &db_options = *this;
    ConfigParser parser;
    AddParametersToConfigParser(db_options, parser);
    parser.LoadDefaultValues();
  }

  // *** Internal options (part of the file format, cannot be changed by users)
  uint64_t internal__hstable_header_size;

  // Wherever a loop needs to lock to merge or process data, this is the
  // number of iterations done for each locking of the dedicated mutex.
  // This allows to throttle the locking and prevents the starvation
  // of other processes.
  uint64_t internal__num_iterations_per_lock;

  // The time that a closing process will have to wait when flushing the vectors
  // in the Writer Buffer.
  uint64_t internal__close_timeout;

  // If a file fails to open in the HSTableManager, this is the delay that
  // will be waited before retrying to open the file.
  uint64_t internal__open_file_retry_delay;

  // Size of an entry value for which using a multipart is required. Below this
  // size, a byte array will be allocated an the entry will be uncompressed
  // in it.
  uint64_t internal__size_multipart_required;

  // The frequency at which the compaction conditions are checked.
  uint64_t internal__compaction_check_interval;

  // *** Constant options (cannot be changed after the db is created)
  HashType hash;
  CompressionOptions compression;
  ChecksumType checksum;
  uint64_t storage__hstable_size;             // bound to 'db.storage.hstable-size'
  std::string storage__compression_algorithm; // bound to 'db.storage.compression'
  std::string storage__hashing_algorithm;     // bound to 'db.storage.hashing'

  // *** Instance options (can be changed each time the db is opened)
  bool create_if_missing;
  bool error_if_exists;
  uint32_t max_open_files; // TODO: this parameter is ignored: use it
  uint64_t rate_limit_incoming;

  uint64_t write_buffer__size;          // bound to 'db.write-buffer.size'
  uint64_t write_buffer__flush_timeout; // bound to 'db.write-buffer.flush-timeout'
  std::string write_buffer__mode_str;   // bound to 'db.write-buffer.mode'
  WriteBufferMode write_buffer__mode;   // parsed from write_buffer__mode_str

  uint64_t storage__inactivity_timeout;
  uint64_t storage__statistics_polling_interval;
  uint64_t storage__minimum_free_space_accept_orders;
  uint64_t storage__maximum_part_size;

  uint64_t compaction__force_interval;
  uint64_t compaction__filesystem__survival_mode_threshold;
  uint64_t compaction__filesystem__normal_batch_size;
  uint64_t compaction__filesystem__survival_batch_size;
  uint64_t compaction__filesystem__free_space_required;

  // *** Logging
  std::string log_level;
  std::string log_target;

  // Path of the serialized options file inside the database directory.
  static std::string GetPath(const std::string &dirpath) {
    return dirpath + "/db_options";
  }

  static std::string GetFilename() { return "db_options"; }

  // Declares every user-facing parameter (name, default value, destination
  // member, and help text) on the given parser.
  static void AddParametersToConfigParser(DatabaseOptions& db_options, ConfigParser& parser) {

    // Logging options
    parser.AddParameter(new kdb::StringParameter(
                         "log.level", "info", &db_options.log_level, false,
                         "Level of the logging, can be: silent, emerg, alert, crit, error, warn, notice, info, debug, trace."));
    parser.AddParameter(new kdb::StringParameter(
                         "log.target", "kingdb", &db_options.log_target, false,
                         "Target of the logs, can be 'stderr' to log to stderr, or any custom string that will be used as the 'ident' parameter for syslog."));

    // Database options
    parser.AddParameter(new kdb::BooleanParameter(
                         "db.create-if-missing", true, &db_options.create_if_missing, false,
                         "Will create the database if it does not already exists"));
    parser.AddParameter(new kdb::BooleanParameter(
                         "db.error-if-exists", false, &db_options.error_if_exists, false,
                         "Will exit if the database already exists"));
    parser.AddParameter(new kdb::UnsignedInt64Parameter(
                         "db.incoming-rate-limit", "0", &db_options.rate_limit_incoming, false,
                         "Limit the rate of incoming traffic, in bytes per second. Unlimited if equal to 0."));
    parser.AddParameter(new kdb::UnsignedInt64Parameter(
                         "db.write-buffer.size", "64MB", &db_options.write_buffer__size, false,
                         "Size of the Write Buffer."));
    parser.AddParameter(new kdb::UnsignedInt64Parameter(
                         "db.write-buffer.flush-timeout", "500 milliseconds", &db_options.write_buffer__flush_timeout, false,
                         "The timeout after which the write buffer will flush its cache."));
    parser.AddParameter(new kdb::StringParameter(
                         "db.write-buffer.mode", "direct", &db_options.write_buffer__mode_str, false,
                         "The mode with which the write buffer handles incoming traffic, can be 'direct' or 'adaptive'. With the 'direct' mode, once the Write Buffer is full other incoming Write and Delete operations will block until the buffer is persisted to secondary storage. The direct mode should be used when the clients are not subjects to timeouts. When choosing the 'adaptive' mode, incoming orders will be made slower, down to the speed of the writes on the secondary storage, so that they are almost just as fast as when using the direct mode, but are never blocking. The adaptive mode is expected to introduce a small performance decrease, but required for cases where clients timeouts must be avoided, for example when the database is used over a network."));
    parser.AddParameter(new kdb::UnsignedInt64Parameter(
                         "db.storage.hstable-size", "32MB", &db_options.storage__hstable_size, false,
                         "Maximum size a HSTable can have. Entries with keys and values beyond that size are considered to be large entries."));
    parser.AddParameter(new kdb::StringParameter(
                         "db.storage.compression", "lz4", &db_options.storage__compression_algorithm, false,
                         "Compression algorithm used by the storage engine. Can be 'disabled' or 'lz4'."));
    parser.AddParameter(new kdb::StringParameter(
                         "db.storage.hashing", "xxhash-64", &db_options.storage__hashing_algorithm, false,
                         "Hashing algorithm used by the storage engine. Can be 'xxhash-64' or 'murmurhash3-64'."));
    parser.AddParameter(new kdb::UnsignedInt64Parameter(
                         "db.storage.minimum-free-space-accept-orders", "192MB", &db_options.storage__minimum_free_space_accept_orders, false,
                         "Minimum free disk space required to accept incoming orders. It is recommended that for this value to be at least (2 x 'db.write-buffer.size' + 4 x 'db.hstable.maximum-size'), so that when the file system fills up, the two write buffers can be flushed to secondary storage safely and the survival-mode compaction process can be run."));
    parser.AddParameter(new kdb::UnsignedInt64Parameter(
                         "db.storage.maximum-part-size", "1MB", &db_options.storage__maximum_part_size, false,
                         "The maximum part size is used by the storage engine to split entries into smaller parts -- important for the compression and hashing algorithms, can never be more than (2^32 - 1) as the algorihms used do not support sizes above that value."));
    parser.AddParameter(new kdb::UnsignedInt64Parameter(
                         "db.storage.inactivity-streaming", "60 seconds", &db_options.storage__inactivity_timeout, false,
                         "The time of inactivity after which an entry stored with the streaming API is considered left for dead, and any subsequent incoming parts for that entry are rejected."));
    parser.AddParameter(new kdb::UnsignedInt64Parameter(
                         "db.storage.statistics-polling-interval", "5 seconds", &db_options.storage__statistics_polling_interval, false,
                         "The frequency at which statistics are polled in the Storage Engine (free disk space, etc.)."));

    // Compaction options
    parser.AddParameter(new kdb::UnsignedInt64Parameter(
                         "db.compaction.force-interval", "5 minutes", &db_options.compaction__force_interval, false,
                         "Duration after which, if no compaction process has been performed, a compacted is started. Set to 0 to disable."));
    parser.AddParameter(new kdb::UnsignedInt64Parameter(
                         "db.compaction.filesystem.free-space-required", "128MB", &db_options.compaction__filesystem__free_space_required, false,
                         "Minimum free space on the file system required for a compaction process to be started."));
    parser.AddParameter(new kdb::UnsignedInt64Parameter(
                         "db.compaction.filesystem.survival-mode-threshold", "2GB", &db_options.compaction__filesystem__survival_mode_threshold, false,
                         "If the free space on the file system is above that threshold, the compaction is in 'normal mode'. Below that threshold, the compaction is in 'survival mode'. Each mode triggers the compaction process for different amount of uncompacted data found in the database."));
    parser.AddParameter(new kdb::UnsignedInt64Parameter(
                         "db.compaction.filesystem.normal-batch-size", "1GB", &db_options.compaction__filesystem__normal_batch_size, false,
                         "If the compaction is in normal mode and the amount of uncompacted data is above that value of 'normal-batch-size', then the compaction will start when the compaction conditions are checked."));
    parser.AddParameter(new kdb::UnsignedInt64Parameter(
                         "db.compaction.filesystem.survival-batch-size", "256MB", &db_options.compaction__filesystem__survival_batch_size, false,
                         "If the compaction is in survival mode and the amount of uncompacted data is above that value of 'survival-batch-size', then the compaction will start when the compaction conditions are checked."));
  }
};

struct ReadOptions {
  // When true, entry checksums are verified on read.
  bool verify_checksums;
  ReadOptions()
      : verify_checksums(false) {
  }
};

struct WriteOptions {
  // When true, writes are synced to storage before returning.
  bool sync;
  WriteOptions()
      : sync(false) {
  }
};

// Options controlling the network server (see network/server.h).
struct ServerOptions {
  ServerOptions()
      : internal__size_buffer_send(1024) {
  }

  // *** Internal options
  uint64_t internal__size_buffer_send;

  uint32_t interface__memcached_port;
  uint32_t listen_backlog;
  uint32_t num_threads;
  uint64_t recv_socket_buffer_size;

  // TODO: add a max-connections parameter?

  // Declares every server parameter (name, default, destination member, and
  // help text) on the given parser.
  static void AddParametersToConfigParser(ServerOptions& server_options, ConfigParser& parser) {
    parser.AddParameter(new kdb::UnsignedInt64Parameter(
                         "server.recv-socket-buffer-size", "64KB", &server_options.recv_socket_buffer_size, false,
                         "Size of the buffer used to receive data from the network. Each thread of the server has one such buffer."));
    parser.AddParameter(new kdb::UnsignedInt32Parameter(
                         "server.listen-backlog", "150", &server_options.listen_backlog, false,
                         "Size of the listen() backlog."));
    parser.AddParameter(new kdb::UnsignedInt32Parameter(
                         "server.num-threads", "150", &server_options.num_threads, false,
                         "Num of threads in the pool of workers."));
    parser.AddParameter(new kdb::UnsignedInt32Parameter(
                         "server.interface.memcached-port", "11211", &server_options.interface__memcached_port, false,
                         "Port where the memcached interface will listen."));
  }
};

} // namespace kdb

#endif // KINGDB_OPTIONS_H_


================================================
FILE: util/order.h
================================================
// Copyright (c) 2014, Emmanuel Goossaert. All rights reserved.
// Use of this source code is governed by the BSD 3-Clause License,
// that can be found in the LICENSE file.
#ifndef KINGDB_ORDER_H_ #define KINGDB_ORDER_H_ #include "util/debug.h" #include #include #include #include #include #include #include #include #include #include "util/logger.h" #include "util/status.h" #include "algorithm/coding.h" #include "algorithm/crc32c.h" #include "util/byte_array.h" #include "util/options.h" namespace kdb { enum class OrderType { Put, Delete }; struct Order { std::thread::id tid; WriteOptions write_options; OrderType type; ByteArray key; ByteArray chunk; uint64_t offset_chunk; uint64_t size_value; uint64_t size_value_compressed; uint32_t crc32; bool is_large; bool IsFirstPart() { return (offset_chunk == 0); } bool IsMiddleOrLastPart() { return !IsFirstPart(); } bool IsLastPart() { return ( (size_value_compressed == 0 && chunk.size() + offset_chunk == size_value) || (size_value_compressed != 0 && chunk.size() + offset_chunk == size_value_compressed)); } bool IsSelfContained() { return IsFirstPart() && IsLastPart(); } bool IsLarge() { return is_large; } }; } // namespace kdb #endif // KINGDB_ORDER_H_ ================================================ FILE: util/status.cc ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. 
#include "util/status.h" namespace kdb { std::string Status::ToString() const { if (message1_ == "") { return "OK"; } else { char tmp[30]; const char* type; switch (code()) { case kOK: type = "OK"; break; case kNotFound: type = "Not found: "; break; case kDeleteOrder: type = "Delete order: "; break; case kInvalidArgument: type = "Invalid argument: "; break; case kIOError: type = "IO error: "; break; case kDone: type = "Done: "; break; case kMultipartRequired: type = "MultipartRequired: the entry is too large to fit in memory, use the multipart API instead."; break; default: snprintf(tmp, sizeof(tmp), "Unknown code (%d): ", static_cast(code())); type = tmp; break; } std::string result(type); result.append(message1_); if (message2_.size() > 0) { result.append(" - "); result.append(message2_); } return result; } } }; ================================================ FILE: util/status.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. // // The source code in this file was mostly derived from LevelDB. // Copyright (c) 2011, The LevelDB Authors. All rights reserved. // Use of the LevelDB source code is governed by a BSD-style license // that can be found in the 3rdparty/leveldb/LICENSE file. See // the 3rdparty/leveldb/AUTHORS file for the names of the LevelDB // contributors. 
#ifndef KINGDB_STATUS_H_ #define KINGDB_STATUS_H_ #include "util/debug.h" #include namespace kdb { class Status { public: Status() { code_ = kOK; message1_ = ""; } ~Status() {} Status(int code) { code_ = code; } Status(int code, std::string message1, std::string message2) { code_ = code; message1_ = message1; message2_ = message2; } static Status OK() { return Status(); } static Status Done() { return Status(kDone); } static Status MultipartRequired() { return Status(kMultipartRequired); } static Status DeleteOrder() { return Status(kDeleteOrder); } static Status NotFound(const std::string& message1, const std::string& message2="") { return Status(kNotFound, message1, message2); } static Status InvalidArgument(const std::string& message1, const std::string& message2="") { return Status(kInvalidArgument, message1, message2); } static Status IOError(const std::string& message1, const std::string& message2="") { return Status(kIOError, message1, message2); } bool IsOK() const { return (code() == kOK); } bool IsNotFound() const { return code() == kNotFound; } bool IsDeleteOrder() const { return code() == kDeleteOrder; } bool IsInvalidArgument() const { return code() == kInvalidArgument; } bool IsIOError() const { return code() == kIOError; } bool IsDone() const { return code() == kDone; } bool IsMultipartRequired() const { return code() == kMultipartRequired; } std::string ToString() const; private: int code_; std::string message1_; std::string message2_; int code() const { return code_; }; enum Code { kOK = 0, kNotFound = 1, kDeleteOrder = 2, kInvalidArgument = 3, kIOError = 4, kDone = 5, kMultipartRequired = 6 }; }; }; // end namespace kdb #endif // KINGDB_STATUS_H_ ================================================ FILE: util/version.h ================================================ // Copyright (c) 2014, Emmanuel Goossaert. All rights reserved. // Use of this source code is governed by the BSD 3-Clause License, // that can be found in the LICENSE file. 
#ifndef KINGDB_VERSION_H_ #define KINGDB_VERSION_H_ namespace kdb { static const uint32_t kVersionMajor = 0; static const uint32_t kVersionMinor = 9; static const uint32_t kVersionRevision = 0; static const uint32_t kVersionBuild = 0; } // namespace kdb #endif // KINGDB_VERSION_H_