Repository: nadavrot/memset_benchmark Branch: main Commit: ed6cd499f55a Files: 21 Total size: 53.9 KB Directory structure: gitextract_j6tepqkv/ ├── .gitignore ├── CMakeLists.txt ├── README.md ├── docs/ │ └── annotated_glibc.txt ├── include/ │ ├── decl.h │ ├── types.h │ └── utils.h └── src/ ├── memcpy/ │ ├── CMakeLists.txt │ ├── bench_memcpy.cc │ ├── folly.S │ ├── impl.S │ ├── impl.c │ └── test_memcpy.cc ├── memset/ │ ├── CMakeLists.txt │ ├── bench_memset.cc │ ├── impl.S │ ├── impl.c │ ├── shims.c │ └── test_memset.cc └── utils/ ├── CMakeLists.txt └── hist_tool.c ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ *.o *.swn *.swo *.swp *~ .DS_Store *.so *.dylib GPATH GRTAGS GTAGS tags compile_commands.json toolchain/ llvm-project/ gcc-project/ build*/ .vscode/ .vim/ .idea/ ================================================ FILE: CMakeLists.txt ================================================ cmake_minimum_required(VERSION 3.7) project(bpf_tracer VERSION 1.0.0 DESCRIPTION "Memset benchmarks") set(CMAKE_CXX_STANDARD 14) set(CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) # Export a JSON file with the compilation commands that external tools can use # to analyze the source code of the project. 
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) enable_language(C ASM) # Disable exceptions SET (CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} "-fno-rtti ") if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) message(STATUS "No build type selected, default to Release") set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Build type (default RelWithDebInfo)" FORCE) endif() add_compile_options(-Wall -g3 -O3 -march=native) set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Wall -march=native") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall -fno-omit-frame-pointer -O0") # Place all of the binaries in the build directory. set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) set (CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) include_directories(BEFORE ${CMAKE_CURRENT_BINARY_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/include ) add_subdirectory(src/memset/) add_subdirectory(src/memcpy/) add_subdirectory(src/utils/) ================================================ FILE: README.md ================================================ # Fast Memset and Memcpy implementations *UPDATE*: Ilya Albrecht landed the memset implementation from this repo into [Folly](https://github.com/facebook/folly/blob/main/folly/memset.S). This repository contains high-performance implementations of memset and memcpy. These implementations outperform the folly and glibc implementations. This repository contains several reference implementations in C and assembly. The high-performance implementations are found in the files called "impl.S". Before reading the source code in this repository you probably want to read an excellent blog [post](https://msrc-blog.microsoft.com/2021/01/11/building-faster-amd64-memset-routines/) by Joe Bialek about his work to optimize memset for windows. The charts below compare the code in this repo with other implementations: folly, musl, and glibc. The glibc implementations are measured with and without the elf indirection, as suggested by Dave Zarzycki. 
## Memset ![Memset](docs/memset_bench.png) ## Memcpy ![Memcpy](docs/memcpy_bench.png) The chart below compares the performance of different memset implementations on buffers of varying sizes and offsets. Unlike the hot loop that hammers a single value, this benchmark is more realistic and takes into account mispredicted branches and the performance of the cpu decoder. The buffers are in the size range 0 to 256. The random function is made of pre-computed random values, to lower the overhead of the random function. This was suggested by Yann Collet. The 'nop' function is used to compute the benchmark setup and call overhead. The numbers below represent the implementation execution time minus the nop function time. ![memset](docs/memset_r.png) ![memcpy](docs/memcpy_r.png) The size of the buffer that memset and memcpy mutates is typically small. The picture below presents the buffer length distribution in google-chrome. Vim, Python, and even server workloads have a similar distribution. The values in the chart represent the power of two buffer size (10 represents the values between 512 and 1024). ![Histogram](docs/hist.png) The chart below presents a histogram of pointer alignment (from the game minecraft). Most of the pointers that are called by memset and memcpy are aligned to 8-byte values. Some programs have histograms that are not as sharp, meaning that there are more values that are not aligned to 4 or 8-byte boundary. ![Pointer Alignment](docs/align.png) Memcpy and Memset and frequently called by low-level high-performance libraries. 
Here is one example of a stack trace from the Firefox codebase:
It runs the benchmark several times and picks the least noisy results. It's a good idea to run the benchmark tool and compare some implementation to itself to assess the noise level in the system. The benchmarking tool uses a trampoline to prevent the compiler from inlining and expanding the memset. ## Histogram tool The histogram tool is a shared object that collects records calls to memset and memcpy and creates a histogram of the length parameter. It prints the histogram when the program exits cleanly. The shared object can be loaded using LD\_PRELOAD (on Linux) or DYLD\_INSERT\_LIBRARIES (on Mac). Each bucket in the output represents the log2 size of the buffer, and each value represents the number of hits for the bucket. ## Proxy tool This is a small utility that swaps the builtin call to memset and memcpy with the local implementation from this project. The shared object can be loaded using LD\_PRELOAD (on Linux) or DYLD\_INSERT\_LIBRARIES (on Mac). ================================================ FILE: docs/annotated_glibc.txt ================================================ <+0>: endbr64 <+4>: vmovd %esi, %xmm0 <+8>: movq %rdi, %rax <+11>: vpbroadcastb %xmm0, %ymm0 <+16>: cmpq $0x20, %rdx <+20>: jb 0xBELOW_32____ ; <+190> <+26>: cmpq $0x40, %rdx <+30>: ja 0xABOVE_64____ ; <+46> <+32>: vmovdqu %ymm0, -0x20(%rdi,%rdx) <+38>: vmovdqu %ymm0, (%rdi) <+42>: vzeroupper <+45>: retq 0xABOVE_64____ <+46>: cmpq $0x800, %rdx ; imm = 0x800 <+53>: ja 0xABOVE_2048__ ; ___lldb_unnamed_symbol1097$$libc.so.6 + 4 <+55>: cmpq $0x80, %rdx <+62>: ja 0xABOVE_128___ ; <+89> 0xSZ_64_TO_128 <+64>: vmovdqu %ymm0, (%rdi) <+68>: vmovdqu %ymm0, 0x20(%rdi) <+73>: vmovdqu %ymm0, -0x20(%rdi,%rdx) <+79>: vmovdqu %ymm0, -0x40(%rdi,%rdx) 0xEXIT_EXIT___ <+85>: vzeroupper <+88>: retq 0xABOVE_128___ <+89>: leaq 0x80(%rdi), %rcx <+96>: vmovdqu %ymm0, (%rdi) <+100>: andq $-0x80, %rcx <+104>: vmovdqu %ymm0, -0x20(%rdi,%rdx) <+110>: vmovdqu %ymm0, 0x20(%rdi) <+115>: vmovdqu %ymm0, -0x40(%rdi,%rdx) 
<+121>: vmovdqu %ymm0, 0x40(%rdi) <+126>: vmovdqu %ymm0, -0x60(%rdi,%rdx) <+132>: vmovdqu %ymm0, 0x60(%rdi) <+137>: vmovdqu %ymm0, -0x80(%rdi,%rdx) <+143>: addq %rdi, %rdx <+146>: andq $-0x80, %rdx <+150>: cmpq %rdx, %rcx <+153>: je 0xEXIT_EXIT___ ; <+85> 0xLOOP_4x32B__ <+155>: vmovdqa %ymm0, (%rcx) <+159>: vmovdqa %ymm0, 0x20(%rcx) <+164>: vmovdqa %ymm0, 0x40(%rcx) <+169>: vmovdqa %ymm0, 0x60(%rcx) <+174>: addq $0x80, %rcx <+181>: cmpq %rcx, %rdx <+184>: jne 0xLOOP_4x32B__ ; <+155> <+186>: vzeroupper <+189>: retq 0xBELOW_32____ <+190>: cmpb $0x10, %dl <+193>: jae 0xBELOW_16____ ; <+223> <+195>: vmovq %xmm0, %rcx <+200>: cmpb $0x8, %dl <+203>: jae 0xABOVE_8_____ ; <+237> <+205>: cmpb $0x4, %dl <+208>: jae 0xABOVE_4_____ ; <+249> <+210>: cmpb $0x1, %dl <+213>: ja 0xABOVE_1_____ ; <+259> <+215>: jb 0xIS_ZERO_CASE ; <+219> <+217>: movb %cl, (%rdi) 0xIS_ZERO_CASE <+219>: vzeroupper <+222>: retq 0xBELOW_16____ <+223>: vmovdqu %xmm0, -0x10(%rdi,%rdx) <+229>: vmovdqu %xmm0, (%rdi) <+233>: vzeroupper <+236>: retq 0xABOVE_8_____ <+237>: movq %rcx, -0x8(%rdi,%rdx) <+242>: movq %rcx, (%rdi) <+245>: vzeroupper <+248>: retq 0xABOVE_4____ <+249>: movl %ecx, -0x4(%rdi,%rdx) <+253>: movl %ecx, (%rdi) <+255>: vzeroupper <+258>: retq 0xABOVE_1_____ <+259>: movw %cx, -0x2(%rdi,%rdx) <+264>: movw %cx, (%rdi) <+267>: vzeroupper <+270>: retq <+271>: nop ================================================ FILE: include/decl.h ================================================ #ifndef DECLS #define DECLS #include #ifdef __cplusplus using memset_ty = void *(void *s, int c, size_t n); using memcpy_ty = void *(void *dest, const void *src, size_t n); extern "C" { #endif void *memcpy(void *dest, const void *src, size_t n); void *__folly_memcpy(void *dest, const void *src, size_t n); void *libc_memcpy(void *dest, const void *src, size_t n); void *local_memcpy(void *dest, const void *src, size_t n); void *asm_memcpy(void *dest, const void *src, size_t n); void *memset(void *s, int c, size_t n); void 
*libc_memset(void *s, int c, size_t n); void *local_memset(void *s, int c, size_t n); void *asm_memset(void *s, int c, size_t n); void *musl_memset(void *s, int c, size_t n); #ifdef __cplusplus } #endif #endif // DECLS ================================================ FILE: include/types.h ================================================ #ifndef TYPES #define TYPES #include #define NO_INLINE __attribute__((noinline)) #ifdef __clang__ typedef char char8 __attribute__((ext_vector_type(8), aligned(1))); typedef char char16 __attribute__((ext_vector_type(16), aligned(1))); typedef char char32 __attribute__((ext_vector_type(32), aligned(1))); typedef char char32a __attribute__((ext_vector_type(32), aligned(32))); #else // __GNUC__ typedef char char8 __attribute__((vector_size(8), aligned(1))); typedef char char16 __attribute__((vector_size(16), aligned(1))); typedef char char32 __attribute__((vector_size(32), aligned(1))); typedef char char32a __attribute__((vector_size(32), aligned(32))); #endif typedef uint32_t __attribute__((aligned(1))) u32; typedef uint64_t __attribute__((aligned(1))) u64; #endif // TYPES ================================================ FILE: include/utils.h ================================================ #ifndef UTILS_H #define UTILS_H #include #include #include #include "types.h" /// Aligns the pointer \p ptr, to alignment \p alignment and offset \p offset /// within the word. void *align_pointer(void *ptr, unsigned alignment, unsigned offset) { size_t p = (size_t)ptr; while (p % alignment) ++p; return (void *)(p + (size_t)offset); } using time_point = std::chrono::steady_clock::time_point; class Stopwatch { /// The time of the last sample; time_point begin_; /// A list of recorded intervals. 
std::vector intervals_; public: NO_INLINE Stopwatch() : begin_() {} NO_INLINE void start() { begin_ = std::chrono::steady_clock::now(); } NO_INLINE void stop() { time_point end = std::chrono::steady_clock::now(); uint64_t interval = std::chrono::duration_cast(end - begin_) .count(); intervals_.push_back(interval); } NO_INLINE uint64_t get_median() { std::sort(intervals_.begin(), intervals_.end()); return intervals_[intervals_.size() / 2]; } }; uint8_t random_bytes[320] = { 227, 138, 244, 198, 73, 247, 185, 248, 229, 75, 24, 215, 159, 230, 136, 246, 200, 144, 65, 67, 109, 86, 118, 61, 209, 103, 188, 213, 187, 8, 210, 121, 214, 178, 232, 59, 153, 92, 209, 239, 44, 85, 156, 172, 237, 41, 150, 195, 247, 202, 249, 142, 208, 133, 21, 204, 114, 38, 51, 150, 194, 46, 184, 138, 50, 250, 190, 180, 161, 5, 211, 191, 62, 137, 142, 122, 63, 72, 233, 125, 189, 51, 238, 51, 116, 10, 44, 18, 240, 41, 157, 81, 183, 252, 214, 17, 81, 12, 44, 119, 77, 97, 101, 80, 106, 128, 190, 89, 160, 104, 244, 192, 46, 69, 73, 255, 45, 213, 190, 86, 18, 89, 34, 46, 134, 145, 166, 128, 87, 97, 192, 71, 105, 94, 51, 30, 7, 9, 0, 40, 0, 187, 205, 189, 151, 159, 107, 105, 180, 182, 233, 52, 209, 108, 186, 31, 184, 254, 170, 71, 162, 31, 80, 226, 75, 125, 214, 125, 247, 197, 149, 132, 247, 157, 253, 101, 107, 1, 127, 236, 249, 242, 152, 169, 123, 240, 129, 230, 135, 25, 57, 227, 130, 189, 76, 254, 33, 193, 39, 82, 177, 143, 31, 17, 20, 195, 219, 165, 171, 198, 125, 119, 216, 143, 55, 210, 17, 88, 150, 126, 38, 160, 71, 214, 10, 162, 158, 6, 234, 233, 119, 221, 167, 62, 146, 50, 150, 176, 142, 167, 201, 250, 195, 26, 156, 96, 36, 177, 95, 23, 7, 63, 55, 142, 80, 227, 73, 124, 93, 211, 231, 166, 182, 57, 145, 55, 242, 213, 246, 30, 146, 247, 19, 229, 34, 210, 37, 147, 242, 103, 125, 91, 171, 51, 22, 126, 248, 149, 19, 60, 89, 5, 241, 132, 72, 217, 195, 11, 173, 247, 47, 144, 222, 94, 51, 166, 192, 50, 109, 62, 42, 126, 111, 204, 141, 66, }; /// Implements a doom-style random number generator. 
struct DoomRNG { // Points to the current random number. unsigned rand_curr = 0; void rand_reset() { rand_curr = 0; } uint8_t next_u8_random() { return random_bytes[rand_curr++ % 320]; } }; #endif // UTILS_H ================================================ FILE: src/memcpy/CMakeLists.txt ================================================ add_executable(test_memcpy test_memcpy.cc folly.S impl.S impl.c ) target_link_libraries(test_memcpy PUBLIC) add_executable(bench_memcpy bench_memcpy.cc folly.S impl.S impl.c ) install(TARGETS bench_memcpy DESTINATION bin) install(TARGETS test_memcpy DESTINATION bin) ================================================ FILE: src/memcpy/bench_memcpy.cc ================================================ #include #include #include #include #include #include #include "decl.h" #include "utils.h" //////////////////////////////////////////////////////////////////////////////// // This is a small program that compares two memcpy implementations and records // the output in a csv file. //////////////////////////////////////////////////////////////////////////////// #define ITER (1000L * 1000L * 10L) #define SAMPLES (20) DoomRNG RNG; /// Measure a single implementation \p handle. uint64_t measure(memcpy_ty handle, void *dest, void *src, unsigned size) { Stopwatch T; for (unsigned i = 0; i < SAMPLES; i++) { T.start(); for (size_t j = 0; j < ITER; j++) { (handle)(dest, src, size); } T.stop(); } return T.get_median(); } // Allocate memory and benchmark each implementation at a specific size \p size. 
void bench_impl(const std::vector &toTest, unsigned size, unsigned align, unsigned offset) { std::vector dest(size + 256, 0); std::vector src(size + 256, 0); char *src_ptr = (char *)align_pointer(&src[0], align, offset); char *dest_ptr = (char *)align_pointer(&dest[0], align, offset); std::cout << size << ", "; for (auto handle : toTest) { u_int64_t res = measure(handle, dest_ptr, src_ptr, size); std::cout << res << ", "; } std::cout << std::endl; } /// Allocate and copy buffers at random offsets and in random sizes. /// The sizes and the offsets are in the range 0..256. void bench_rand_range(const std::vector &toTest) { std::vector dest(4096, 1); std::vector src(4096, 0); const char *src_p = &src[0]; char *dest_p = &dest[0]; for (auto handle : toTest) { Stopwatch T; sleep(1); for (unsigned i = 0; i < SAMPLES; i++) { RNG.rand_reset(); T.start(); for (size_t j = 0; j < ITER; j++) { char *to = dest_p + RNG.next_u8_random(); const char *from = src_p + RNG.next_u8_random(); (handle)(to, from, RNG.next_u8_random()); } T.stop(); } std::cout << T.get_median() << ", "; } std::cout << std::endl; } // To measure the call overhead. void *nop(void *dest, const void *src, size_t n) { return dest; } int main(int argc, char **argv) { std::cout << std::setprecision(3); std::cout << std::fixed; std::vector toTest = { &libc_memcpy, &memcpy, &__folly_memcpy, &local_memcpy, &asm_memcpy, &nop}; std::cout << "Batches of random sizes:\n"; std::cout << "libc@plt, libc, folly, c_memcpy, asm_memcpy, nop,\n"; bench_rand_range(toTest); std::cout << "\nFixed size:\n"; std::cout << "size, libc@plt, libc, folly, c_memcpy, asm_memcpy, nop,\n"; for (int i = 0; i < 512; i++) { bench_impl(toTest, i, 16, 0); } return 0; } ================================================ FILE: src/memcpy/folly.S ================================================ /* * Copyright (c) Facebook, Inc. and its affiliates. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * __folly_memcpy: An optimized memcpy implementation that uses prefetch and * AVX2 instructions. * * This implementation of memcpy acts as a memmove, but it is not optimized for * this purpose. While overlapping copies are undefined in memcpy, this * implementation acts like memmove for sizes up through 256 bytes and will * detect overlapping copies and call memmove for overlapping copies of 257 or * more bytes. * * This implementation uses prefetch to avoid dtlb misses. This can * substantially reduce dtlb store misses in cases where the destination * location is absent from L1 cache and where the copy size is small enough * that the hardware prefetcher doesn't have a large impact. * * The number of branches is limited by the use of overlapping copies. This * helps with copies where the source and destination cache lines are already * present in L1 because there are fewer instructions to execute and fewer * branches to potentially mispredict. * * Vector operations up to 32-bytes are used (avx2 instruction set). Larger * mov operations (avx512) are not used. * * Large copies make use of aligned store operations. This operation is * observed to always be faster than rep movsb, so the rep movsb instruction * is not used. * * If the copy size is humongous and the source and destination are both * aligned, this memcpy will use non-temporal operations. 
This can have * a substantial speedup for copies where data is absent from L1, but it * is significantly slower if the source and destination data were already * in L1. The use of non-temporal operations also has the effect that after * the copy is complete, the data will be moved out of L1, even if the data was * present before the copy started. * * @author Logan Evans */ #if defined(__AVX2__) // This threshold is half of L1 cache on a Skylake machine, which means that // potentially all of L1 will be populated by this copy once it is executed // (dst and src are cached for temporal copies). #define NON_TEMPORAL_STORE_THRESHOLD $32768 .file "memcpy.S" .section .text,"ax" .type __folly_memcpy_short, @function __folly_memcpy_short: .cfi_startproc .L_GE1_LE7: cmp $1, %rdx je .L_EQ1 cmp $4, %rdx jae .L_GE4_LE7 .L_GE2_LE3: movw (%rsi), %r8w movw -2(%rsi,%rdx), %r9w movw %r8w, (%rdi) movw %r9w, -2(%rdi,%rdx) ret .align 2 .L_EQ1: movb (%rsi), %r8b movb %r8b, (%rdi) ret // Aligning the target of a jump to an even address has a measurable // speedup in microbenchmarks. .align 2 .L_GE4_LE7: movl (%rsi), %r8d movl -4(%rsi,%rdx), %r9d movl %r8d, (%rdi) movl %r9d, -4(%rdi,%rdx) ret .cfi_endproc .size __folly_memcpy_short, .-__folly_memcpy_short // memcpy is an alternative entrypoint into the function named __folly_memcpy. // The compiler is able to call memcpy since the name is global while // stacktraces will show __folly_memcpy since that is the name of the function. // This is intended to aid in debugging by making it obvious which version of // memcpy is being used. 
.align 64 .globl __folly_memcpy .type __folly_memcpy, @function __folly_memcpy: .cfi_startproc mov %rdi, %rax test %rdx, %rdx je .L_EQ0 prefetchw (%rdi) prefetchw -1(%rdi,%rdx) cmp $8, %rdx jb .L_GE1_LE7 .L_GE8: cmp $32, %rdx ja .L_GE33 .L_GE8_LE32: cmp $16, %rdx ja .L_GE17_LE32 .L_GE8_LE16: mov (%rsi), %r8 mov -8(%rsi,%rdx), %r9 mov %r8, (%rdi) mov %r9, -8(%rdi,%rdx) .L_EQ0: ret .align 2 .L_GE17_LE32: movdqu (%rsi), %xmm0 movdqu -16(%rsi,%rdx), %xmm1 movdqu %xmm0, (%rdi) movdqu %xmm1, -16(%rdi,%rdx) ret .align 2 .L_GE193_LE256: vmovdqu %ymm3, 96(%rdi) vmovdqu %ymm4, -128(%rdi,%rdx) .L_GE129_LE192: vmovdqu %ymm2, 64(%rdi) vmovdqu %ymm5, -96(%rdi,%rdx) .L_GE65_LE128: vmovdqu %ymm1, 32(%rdi) vmovdqu %ymm6, -64(%rdi,%rdx) .L_GE33_LE64: vmovdqu %ymm0, (%rdi) vmovdqu %ymm7, -32(%rdi,%rdx) vzeroupper ret .align 2 .L_GE33: vmovdqu (%rsi), %ymm0 vmovdqu -32(%rsi,%rdx), %ymm7 cmp $64, %rdx jbe .L_GE33_LE64 prefetchw 64(%rdi) vmovdqu 32(%rsi), %ymm1 vmovdqu -64(%rsi,%rdx), %ymm6 cmp $128, %rdx jbe .L_GE65_LE128 prefetchw 128(%rdi) vmovdqu 64(%rsi), %ymm2 vmovdqu -96(%rsi,%rdx), %ymm5 cmp $192, %rdx jbe .L_GE129_LE192 prefetchw 192(%rdi) vmovdqu 96(%rsi), %ymm3 vmovdqu -128(%rsi,%rdx), %ymm4 cmp $256, %rdx jbe .L_GE193_LE256 .L_GE257: prefetchw 256(%rdi) // Check if there is an overlap. If there is an overlap then the caller // has a bug since this is undefined behavior. However, for legacy // reasons this behavior is expected by some callers. // // All copies through 256 bytes will operate as a memmove since for // those sizes all reads are performed before any writes. // // This check uses the idea that there is an overlap if // (%rdi < (%rsi + %rdx)) && (%rsi < (%rdi + %rdx)), // or equivalently, there is no overlap if // ((%rsi + %rdx) <= %rdi) || ((%rdi + %rdx) <= %rsi). // // %r9 will be used after .L_ALIGNED_DST_LOOP to calculate how many // bytes remain to be copied. 
lea (%rsi,%rdx), %r9 cmp %rdi, %r9 jbe .L_NO_OVERLAP lea (%rdi,%rdx), %r8 cmp %rsi, %r8 // This is a forward jump so that the branch predictor will not predict // a memmove. ja .L_MEMMOVE .align 2 .L_NO_OVERLAP: vmovdqu %ymm0, (%rdi) vmovdqu %ymm1, 32(%rdi) vmovdqu %ymm2, 64(%rdi) vmovdqu %ymm3, 96(%rdi) // Align %rdi to a 32 byte boundary. // %rcx = 128 - 31 & %rdi mov $128, %rcx and $31, %rdi sub %rdi, %rcx lea (%rsi,%rcx), %rsi lea (%rax,%rcx), %rdi sub %rcx, %rdx // %r8 is the end condition for the loop. lea -128(%rsi,%rdx), %r8 cmp NON_TEMPORAL_STORE_THRESHOLD, %rdx jae .L_NON_TEMPORAL_LOOP .align 2 .L_ALIGNED_DST_LOOP: prefetchw 128(%rdi) prefetchw 192(%rdi) vmovdqu (%rsi), %ymm0 vmovdqu 32(%rsi), %ymm1 vmovdqu 64(%rsi), %ymm2 vmovdqu 96(%rsi), %ymm3 add $128, %rsi vmovdqa %ymm0, (%rdi) vmovdqa %ymm1, 32(%rdi) vmovdqa %ymm2, 64(%rdi) vmovdqa %ymm3, 96(%rdi) add $128, %rdi cmp %r8, %rsi jb .L_ALIGNED_DST_LOOP .L_ALIGNED_DST_LOOP_END: sub %rsi, %r9 mov %r9, %rdx vmovdqu %ymm4, -128(%rdi,%rdx) vmovdqu %ymm5, -96(%rdi,%rdx) vmovdqu %ymm6, -64(%rdi,%rdx) vmovdqu %ymm7, -32(%rdi,%rdx) vzeroupper ret .align 2 .L_NON_TEMPORAL_LOOP: testb $31, %sil jne .L_ALIGNED_DST_LOOP // This is prefetching the source data unlike ALIGNED_DST_LOOP which // prefetches the destination data. This choice is again informed by // benchmarks. With a non-temporal store the entirety of the cache line // is being written so the previous data can be discarded without being // fetched. 
prefetchnta 128(%rsi) prefetchnta 196(%rsi) vmovntdqa (%rsi), %ymm0 vmovntdqa 32(%rsi), %ymm1 vmovntdqa 64(%rsi), %ymm2 vmovntdqa 96(%rsi), %ymm3 add $128, %rsi vmovntdq %ymm0, (%rdi) vmovntdq %ymm1, 32(%rdi) vmovntdq %ymm2, 64(%rdi) vmovntdq %ymm3, 96(%rdi) add $128, %rdi cmp %r8, %rsi jb .L_NON_TEMPORAL_LOOP sfence jmp .L_ALIGNED_DST_LOOP_END .L_MEMMOVE: call memmove ret .cfi_endproc .size __folly_memcpy, .-__folly_memcpy #ifdef FOLLY_MEMCPY_IS_MEMCPY .weak memcpy memcpy = __folly_memcpy #endif .ident "GCC: (GNU) 4.8.2" #ifdef __linux__ .section .note.GNU-stack,"",@progbits #endif #endif ================================================ FILE: src/memcpy/impl.S ================================================ #if defined(__APPLE__) .text .global _libc_memcpy .p2align 4, 0x90 _libc_memcpy: jmp _memcpy #else .text .global libc_memcpy .p2align 4, 0x90 libc_memcpy: jmp memcpy #endif #define LABEL(x) .L##x #if defined(__APPLE__) .text .global _asm_memcpy .p2align 5, 0x90 _asm_memcpy: #else .text .global asm_memcpy .p2align 5, 0x90 asm_memcpy: #endif // RDI is the dest // RSI is the src // RDX is length mov %rdi, %rax cmp $64,%rdx ja LABEL(over_64) cmp $16,%rdx jae LABEL(16_to_64) LABEL(below_16): cmp $4,%rdx jbe LABEL(0_to_4) cmp $8,%rdx jbe LABEL(in_4_to_8) LABEL(8_to_16): movq (%rsi), %rcx movq %rcx, (%rax) movq -8(%rsi,%rdx), %rcx movq %rcx, -8(%rax,%rdx) retq LABEL(0_to_4): // Copy the first two bytes: cmp $0,%rdx je LABEL(exit) movb (%rsi), %cl movb %cl, (%rdi) movb -1(%rsi,%rdx), %cl movb %cl, -1(%rdi,%rdx) cmp $2,%rdx jbe LABEL(exit) // Copy the second two bytes, if n > 2. 
movb 1(%rsi), %cl movb %cl, 1(%rdi) movb 2(%rsi), %cl movb %cl, 2(%rdi) retq LABEL(in_4_to_8): movl (%rsi), %ecx movl %ecx, (%rdi) movl -4(%rsi,%rdx), %ecx movl %ecx, -4(%rdi,%rdx) LABEL(exit): retq LABEL(16_to_64): cmp $32, %rdx jbe LABEL(16_to_32) LABEL(32_to_64): vmovdqu (%rsi), %ymm0 vmovdqu %ymm0, (%rdi) vmovdqu -32(%rsi,%rdx), %ymm0 vmovdqu %ymm0, -32(%rdi,%rdx) vzeroupper retq LABEL(16_to_32): movups (%rsi), %xmm0 movups %xmm0, (%rdi) movups -16(%rsi,%rdx), %xmm0 movups %xmm0, -16(%rdi,%rdx) retq // Handle buffers over 64 bytes: LABEL(over_64): cmp $128, %rdx ja LABEL(over_128) // Copy the last wide word. vmovups -32(%rsi,%rdx), %ymm0 // Handle cases in the range 64 to 128. This is two unconditional // stores (64), 1 conditional store (32), and the one 32 byte store at // the end. vmovups (%rsi), %ymm1 vmovups 32(%rsi), %ymm2 cmp $96, %rdx jbe LABEL(64_to_128_done) vmovups 64(%rsi), %ymm3 vmovups %ymm3, 64(%rax) .align 4 LABEL(64_to_128_done): vmovups %ymm1, (%rax) vmovups %ymm2, 32(%rax) // Store the last wide word. vmovups %ymm0, -32(%rax,%rdx) vzeroupper retq LABEL(over_128): // Compute the last writeable destination. 
lea -128(%rdx), %rcx xor %r8, %r8 .align 16 LABEL(over_128_copy_loop): vmovdqu (%rsi, %r8), %ymm0 vmovdqu 32(%rsi, %r8), %ymm1 vmovdqu 64(%rsi, %r8), %ymm2 vmovdqu 96(%rsi, %r8), %ymm3 vmovdqu %ymm0, (%rdi, %r8) vmovdqu %ymm1, 32(%rdi, %r8) vmovdqu %ymm2, 64(%rdi, %r8) vmovdqu %ymm3, 96(%rdi, %r8) add $128, %r8 cmp %rcx, %r8 jb LABEL(over_128_copy_loop) // Handle the tail: lea -32(%rdx), %rcx cmp %r8, %rcx jb LABEL(over_128_done) vmovdqu (%rsi, %r8), %ymm0 vmovdqu %ymm0, (%rdi, %r8) add $32, %r8 cmp %r8, %rcx jb LABEL(over_128_done) vmovdqu (%rsi, %r8), %ymm0 vmovdqu %ymm0, (%rdi, %r8) add $32, %r8 cmp %r8, %rcx jb LABEL(over_128_done) vmovdqu (%rsi, %r8), %ymm0 vmovdqu %ymm0, (%rdi, %r8) LABEL(over_128_done): // Copy the last 32 bytes vmovdqu -32(%rsi, %rdx), %ymm0 vmovdqu %ymm0, -32(%rdi, %rdx) vzeroupper retq ================================================ FILE: src/memcpy/impl.c ================================================ #include "types.h" #include #include void *local_memcpy(void *dest, const void *src, size_t n) { char *d = (char *)dest; const char *s = (char *)src; if (n < 5) { if (n == 0) return dest; d[0] = s[0]; d[n - 1] = s[n - 1]; if (n <= 2) return dest; d[1] = s[1]; d[2] = s[2]; return dest; } if (n <= 16) { if (n >= 8) { const char *first_s = s; const char *last_s = s + n - 8; char *first_d = d; char *last_d = d + n - 8; *((u64 *)first_d) = *((u64 *)first_s); *((u64 *)last_d) = *((u64 *)last_s); return dest; } const char *first_s = s; const char *last_s = s + n - 4; char *first_d = d; char *last_d = d + n - 4; *((u32 *)first_d) = *((u32 *)first_s); *((u32 *)last_d) = *((u32 *)last_s); return dest; } if (n <= 32) { const char *first_s = s; const char *last_s = s + n - 16; char *first_d = d; char *last_d = d + n - 16; *((char16 *)first_d) = *((char16 *)first_s); *((char16 *)last_d) = *((char16 *)last_s); return dest; } const char *last_word_s = s + n - 32; char *last_word_d = d + n - 32; // Stamp the 32-byte chunks. 
do { *((char32 *)d) = *((char32 *)s); d += 32; s += 32; } while (d < last_word_d); // Stamp the last unaligned 32 bytes of the buffer. *((char32 *)last_word_d) = *((char32 *)last_word_s); return dest; } ================================================ FILE: src/memcpy/test_memcpy.cc ================================================ #include #include #include #include "decl.h" #include "utils.h" //////////////////////////////////////////////////////////////////////////////// // This is a small program that checks if some memcpy implementation is correct. //////////////////////////////////////////////////////////////////////////////// #define MAGIC_VALUE0 '#' #define MAGIC_VALUE1 '=' void print_buffer(const char *start, const char *end, char val, const char *ptr) { const char *it = start; while (it != end) { std::cout << *it; it++; } std::cout << "\n"; it = start; while (it != ptr) { std::cout << " "; it++; } std::cout << "^\n"; std::cout << "Filling a buffer of length " << end - start << "."; std::cout << " Expected \"" << val << "\" at index " << ptr - start << std::endl; } void print_buffer_match(const char *start0, const char *start1, size_t len, size_t error_at) { for (size_t i = 0; i < len; i++) { std::cout << start0[i]; } std::cout << "\n"; for (size_t i = 0; i < len; i++) { std::cout << start1[i]; } std::cout << "\n"; for (size_t i = 0; i < error_at; i++) { std::cout << " "; } std::cout << "^\n"; std::cout << "Comparing buffers of length " << len << "."; std::cout << " Invalid value at index " << error_at << "." << std::endl; } // Make sure that the whole buffer, from \p start to \p end, is set to \p val. void assert_uniform_value(const char *start, const char *end, char val) { const char *ptr = start; while (ptr != end) { if (val != *ptr) { print_buffer(start, end, val, ptr); abort(); } ptr++; } } // Make sure that two buffers contain the same memory content. 
void assert_buffers_match(const char *buff1, const char *buff2, size_t len) { for (size_t i = 0; i < len; i++) { if (buff1[i] != buff2[i]) { print_buffer_match(buff1, buff2, len, i); abort(); } } } void test_impl(memcpy_ty handle, const std::string &name, unsigned chunk_size) { std::vector src(chunk_size + 512); std::vector dest(chunk_size + 512, MAGIC_VALUE0); // Fill the buffer with a running counter of printable chars. for (unsigned i = 0; i < src.size(); i++) { src[i] = 'A' + (i % 26); } // Start copying memory at different offsets. for (int src_offset = 0; src_offset < 32; src_offset++) { for (int dest_offset = 0; dest_offset < 32; dest_offset++) { const char *dest_start = &*dest.begin(); const char *dest_end = &*dest.end(); const char *src_region_start = &src[src_offset]; char *dest_region_start = &dest[dest_offset]; char *dest_region_end = &dest[dest_offset + chunk_size]; void *res = (handle)((void *)dest_region_start, src_region_start, chunk_size); if (res != dest_region_start) { std::cout << "Invalid return value." << std::endl; abort(); } // Check the chunk. assert_buffers_match(dest_region_start, src_region_start, chunk_size); // Check before chunk. assert_uniform_value(dest_start, dest_region_start, MAGIC_VALUE0); // Check after chunk. assert_uniform_value(dest_region_end, dest_end, MAGIC_VALUE0); // Reset the dest buffer: std::fill(dest.begin(), dest.end(), MAGIC_VALUE0); } } } int main(int argc, char **argv) { std::cout << "Testing memcpy... 
\n"; #define TEST(FUNC, SIZE) test_impl(FUNC, #FUNC, SIZE); for (int i = 0; i < 1024; i++) { TEST(&memcpy, i); TEST(&__folly_memcpy, i); TEST(&local_memcpy, i); TEST(&asm_memcpy, i); } std::cout << "Done.\n"; return 0; } ================================================ FILE: src/memset/CMakeLists.txt ================================================ add_library(mem_shim SHARED shims.c impl.S impl.c ) set_target_properties(mem_shim PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION 1 ) add_executable(bench_memset bench_memset.cc impl.S impl.c ) add_executable(test_memset test_memset.cc impl.S impl.c ) target_link_libraries(bench_memset PUBLIC) target_link_libraries(test_memset PUBLIC) install(TARGETS bench_memset DESTINATION bin) install(TARGETS test_memset DESTINATION bin) install(TARGETS mem_shim LIBRARY DESTINATION bin) ================================================ FILE: src/memset/bench_memset.cc ================================================ #include #include #include #include #include #include #include "decl.h" #include "utils.h" //////////////////////////////////////////////////////////////////////////////// // This is a small program that compares two memset implementations and records // the output in a csv file. //////////////////////////////////////////////////////////////////////////////// #define ITER (1000L * 1000L * 10L) #define SAMPLES (20) DoomRNG RNG; /// Measure a single implementation \p handle. uint64_t measure(memset_ty handle, unsigned size, unsigned align, unsigned offset, void *ptr) { Stopwatch T; for (unsigned i = 0; i < SAMPLES; i++) { T.start(); for (size_t j = 0; j < ITER; j++) { (handle)(ptr, 0, size); } T.stop(); } return T.get_median(); } // Allocate memory and benchmark each implementation at a specific size \p size. 
void bench_impl(const std::vector &toTest, unsigned size, unsigned align, unsigned offset) { std::vector memory(size + 256, 0); void *ptr = align_pointer(&memory[0], align, offset); std::cout << size << ", "; for (auto handle : toTest) { u_int64_t res = measure(handle, size, align, offset, ptr); std::cout << res << ", "; } std::cout << std::endl; } /// Try to allocate buffers at random offsets and in random sizes. /// The sizes and the offsets are in the range 0..256. void bench_rand_range(const std::vector &toTest) { std::vector memory(1024, 0); void *ptr = &memory[0]; for (auto handle : toTest) { Stopwatch T; sleep(1); for (unsigned i = 0; i < SAMPLES; i++) { RNG.rand_reset(); T.start(); for (size_t j = 0; j < ITER; j++) { (handle)((char *)ptr + RNG.next_u8_random(), 0, RNG.next_u8_random()); } T.stop(); } std::cout << T.get_median() << ", "; } std::cout << std::endl; } // To measure the call overhead. void *nop(void *s, int c, size_t n) { return s; } int main(int argc, char **argv) { std::cout << std::setprecision(3); std::cout << std::fixed; std::vector toTest = {musl_memset, libc_memset, &memset, local_memset, asm_memset, &nop}; std::cout << "Batches of random sizes:\n"; std::cout << " musl, libc@plt, libc, c_memset, asm_memset, nop,\n"; bench_rand_range(toTest); std::cout << "\nFixed size:\n"; std::cout << "size, musl, libc@plt, libc, c_memset, asm_memset, nop,\n"; for (int i = 0; i < 512; i++) { bench_impl(toTest, i, 16, 0); } return 0; } ================================================ FILE: src/memset/impl.S ================================================ #if defined(__APPLE__) .text .global _libc_memset .p2align 4, 0x90 _libc_memset: jmp _memset #else .text .global libc_memset .p2align 4, 0x90 libc_memset: jmp memset #endif #define LABEL(x) .L##x #if defined(__APPLE__) .text .global _asm_memset .p2align 5, 0x90 _asm_memset: #else .text .global asm_memset .p2align 5, 0x90 asm_memset: #endif // RDI is the buffer // RSI is the value // RDX is length vmovd 
%esi, %xmm0                       // (operands of vmovd) load the fill value into xmm0.
	vpbroadcastb %xmm0,%ymm0  // Splat the low byte of xmm0 into all 32 bytes of ymm0.
	mov %rdi,%rax             // memset returns the destination pointer; keep it in rax.
	cmp $0x40,%rdx
	jae LABEL(above_64)       // Lengths >= 64 take the large-buffer path.
LABEL(below_64):
	cmp $0x20, %rdx
	jb LABEL(below_32)
	// 32 <= len < 64: two (possibly overlapping) 32-byte stores cover the buffer.
	vmovdqu %ymm0,(%rdi)
	vmovdqu %ymm0,-0x20(%rdi,%rdx)
	vzeroupper
	retq
LABEL(below_32):
	cmp $0x10, %rdx
	jae LABEL(in_16_to_32)
LABEL(below_16):
	cmp $0x4, %rdx
	jbe LABEL(below_4)
LABEL(in_4_to_16):
	// Scalar stores from this point.
	vmovq %xmm0, %rsi         // rsi now holds 8 copies of the fill byte.
	cmp $0x7, %rdx
	jbe LABEL(in_4_to_8)
	// two 8-wide stores, up to 16 bytes (first and last 8 bytes may overlap).
	mov %rsi, -0x8(%rdi, %rdx)
	mov %rsi,(%rdi)
	vzeroupper
	retq
	.align 4
LABEL(below_4):
	test %rdx, %rdx
	je LABEL(exit)            // len == 0: nothing to store.
	// 1 <= len <= 4: store the first and last bytes (they may alias) ...
	mov %sil, (%rdi)
	mov %sil, -0x1(%rdi,%rdx)
	cmp $0x2, %rdx
	jbe LABEL(exit)
	// ... and for len 3..4 also bytes 1 and 2.
	mov %sil, 0x1(%rdi)
	mov %sil, 0x2(%rdi)
	mov %rdi,%rax             // (redundant: rax already holds rdi from the prologue)
	.align 4
LABEL(exit):
	vzeroupper
	retq
LABEL(in_4_to_8):
	// two 4-wide stores, up to 8 bytes (the two stores may overlap).
	mov %esi,-0x4(%rdi,%rdx)
	mov %esi,(%rdi)
	vzeroupper
	retq
LABEL(in_16_to_32):
	// 16 <= len < 32: two overlapping 16-byte stores.
	vmovups %xmm0,(%rdi)
	vmovups %xmm0,-0x10(%rdi,%rdx)
	vzeroupper
	retq
LABEL(above_64):
	cmp $0xb0, %rdx
	ja LABEL(above_192)       // Lengths > 0xb0 (176) use the aligned stamping loop.
	cmp $0x80, %rdx
	jbe LABEL(in_64_to_128)
	// 128 < len <= 176: fill with unaligned 32-byte words.
	// last_word -> rsi
	lea -0x20(%rdi,%rdx),%rsi
	// rcx -> fill pointer.
	// We have at least 128 bytes to store.
	vmovdqu %ymm0,(%rdi)
	vmovdqu %ymm0, 0x20(%rdi)
	vmovdqu %ymm0, 0x40(%rdi)
	add $0x60,%rdi
	.align 8
LABEL(fill_32):
	vmovdqu %ymm0,(%rdi)
	add $0x20,%rdi
	cmp %rdi,%rsi
	ja LABEL(fill_32)
	// Stamp the last unaligned store.
	vmovdqu %ymm0,(%rsi)
	vzeroupper
	retq
LABEL(in_64_to_128):
	// 64 <= len <= 128: four 32-byte stores, two from each end, overlapping.
	// last_word -> rsi
	vmovdqu %ymm0,(%rdi)
	vmovdqu %ymm0, 0x20(%rdi)
	vmovdqu %ymm0,-0x40(%rdi,%rdx)
	vmovdqu %ymm0,-0x20(%rdi,%rdx)
	vzeroupper
	retq
LABEL(above_192):
	// rdi is the buffer address
	// rsi is the value
	// rdx is length
	// Store the first unaligned 32 bytes.
	vmovdqu %ymm0,(%rdi)
	// The first aligned word is stored in %rsi.
	mov %rdi,%rsi
	and $0xffffffffffffffe0,%rsi // Round down to a 32-byte boundary...
	lea 0x20(%rsi),%rsi          // ...then step up to the first aligned slot.
	// Compute the address of the last unaligned word into rdi.
	lea -0x20(%rdx), %rdx
	add %rdx, %rdi
	// Check if we can do a full 5x32B stamp.
lea 0xa0(%rsi),%rcx cmp %rcx, %rdi jb LABEL(stamp_4) .align 8 LABEL(fill_192): vmovdqa %ymm0,(%rsi) vmovdqa %ymm0,0x20(%rsi) vmovdqa %ymm0,0x40(%rsi) vmovdqa %ymm0,0x60(%rsi) vmovdqa %ymm0,0x80(%rsi) add $0xa0, %rsi lea 0xa0(%rsi),%rcx cmp %rcx, %rdi ja LABEL(fill_192) LABEL(fill_192_tail): cmp %rsi, %rdi jb LABEL(fill_192_done) vmovdqa %ymm0, (%rsi) lea 0x20(%rsi),%rcx cmp %rcx, %rdi jb LABEL(fill_192_done) vmovdqa %ymm0, 0x20(%rsi) lea 0x40(%rsi),%rcx cmp %rcx, %rdi jb LABEL(fill_192_done) vmovdqa %ymm0, 0x40(%rsi) lea 0x60(%rsi),%rcx cmp %rcx, %rdi jb LABEL(fill_192_done) vmovdqa %ymm0, 0x60(%rsi) LABEL(last_wide_store): lea 0x80(%rsi),%rcx cmp %rcx, %rdi jb LABEL(fill_192_done) vmovdqa %ymm0, 0x80(%rsi) LABEL(fill_192_done): // Stamp the last word. vmovdqu %ymm0,(%rdi) vzeroupper ret LABEL(stamp_4): vmovdqa %ymm0,(%rsi) vmovdqa %ymm0,0x20(%rsi) vmovdqa %ymm0,0x40(%rsi) vmovdqa %ymm0,0x60(%rsi) jmp LABEL(last_wide_store) ================================================ FILE: src/memset/impl.c ================================================ #include "types.h" #include #include // Handle memsets of sizes 0..32 static inline void *small_memset(void *s, int c, size_t n) { if (n < 5) { if (n == 0) return s; char *p = s; p[0] = c; p[n - 1] = c; if (n <= 2) return s; p[1] = c; p[2] = c; return s; } if (n <= 16) { uint64_t val8 = ((uint64_t)0x0101010101010101L * ((uint8_t)c)); if (n >= 8) { char *first = s; char *last = s + n - 8; *((u64 *)first) = val8; *((u64 *)last) = val8; return s; } uint32_t val4 = val8; char *first = s; char *last = s + n - 4; *((u32 *)first) = val4; *((u32 *)last) = val4; return s; } char X = c; char *p = s; char16 val16 = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}; char *last = s + n - 16; *((char16 *)last) = val16; *((char16 *)p) = val16; return s; } static inline void *huge_memset(void *s, int c, size_t n) { char *p = s; char X = c; char32 val32 = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, 
X, X, X}; // Stamp the first 32byte store. *((char32 *)p) = val32; char *first_aligned = p + 32 - ((uint64_t)p % 32); char *buffer_end = p + n; char *last_word = buffer_end - 32; // Align the next stores. p = first_aligned; // Unroll the body of the loop to increase parallelism. while (p + (32 * 5) < buffer_end) { *((char32a *)p) = val32; p += 32; *((char32a *)p) = val32; p += 32; *((char32a *)p) = val32; p += 32; *((char32a *)p) = val32; p += 32; *((char32a *)p) = val32; p += 32; } // Complete the last few iterations: #define TRY_STAMP_32_BYTES \ if (p < last_word) { \ *((char32a *)p) = val32; \ p += 32; \ } TRY_STAMP_32_BYTES TRY_STAMP_32_BYTES TRY_STAMP_32_BYTES TRY_STAMP_32_BYTES // Stamp the last unaligned word. *((char32 *)last_word) = val32; return s; } void *local_memset(void *s, int c, size_t n) { char *p = s; char X = c; if (n < 32) { return small_memset(s, c, n); } if (n > 160) { return huge_memset(s, c, n); } char32 val32 = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}; char *last_word = s + n - 32; // Stamp the 32-byte chunks. do { *((char32 *)p) = val32; p += 32; } while (p < last_word); // Stamp the last unaligned 32 bytes of the buffer. *((char32 *)last_word) = val32; return s; } /// This a memset implementation that was copied from musl. We only use it for /// benchmarking. void *musl_memset(void *dest, int c, size_t n) { unsigned char *s = dest; size_t k; /* Fill head and tail with minimal branching. Each * conditional ensures that all the subsequently used * offsets are well-defined and in the dest region. */ if (!n) return dest; s[0] = c; s[n - 1] = c; if (n <= 2) return dest; s[1] = c; s[2] = c; s[n - 2] = c; s[n - 3] = c; if (n <= 6) return dest; s[3] = c; s[n - 4] = c; if (n <= 8) return dest; /* Advance pointer to align it at a 4-byte boundary, * and truncate n to a multiple of 4. The previous code * already took care of any head/tail that get cut off * by the alignment. 
*/ k = -(uintptr_t)s & 3; s += k; n -= k; n &= -4; #ifdef __GNUC__ typedef uint32_t __attribute__((__may_alias__)) u32; typedef uint64_t __attribute__((__may_alias__)) u64; u32 c32 = ((u32)-1) / 255 * (unsigned char)c; /* In preparation to copy 32 bytes at a time, aligned on * an 8-byte bounary, fill head/tail up to 28 bytes each. * As in the initial byte-based head/tail fill, each * conditional below ensures that the subsequent offsets * are valid (e.g. !(n<=24) implies n>=28). */ *(u32 *)(s + 0) = c32; *(u32 *)(s + n - 4) = c32; if (n <= 8) return dest; *(u32 *)(s + 4) = c32; *(u32 *)(s + 8) = c32; *(u32 *)(s + n - 12) = c32; *(u32 *)(s + n - 8) = c32; if (n <= 24) return dest; *(u32 *)(s + 12) = c32; *(u32 *)(s + 16) = c32; *(u32 *)(s + 20) = c32; *(u32 *)(s + 24) = c32; *(u32 *)(s + n - 28) = c32; *(u32 *)(s + n - 24) = c32; *(u32 *)(s + n - 20) = c32; *(u32 *)(s + n - 16) = c32; /* Align to a multiple of 8 so we can fill 64 bits at a time, * and avoid writing the same bytes twice as much as is * practical without introducing additional branching. */ k = 24 + ((uintptr_t)s & 4); s += k; n -= k; /* If this loop is reached, 28 tail bytes have already been * filled, so any remainder when n drops below 32 can be * safely ignored. */ u64 c64 = c32 | ((u64)c32 << 32); for (; n >= 32; n -= 32, s += 32) { *(u64 *)(s + 0) = c64; *(u64 *)(s + 8) = c64; *(u64 *)(s + 16) = c64; *(u64 *)(s + 24) = c64; } #else /* Pure C fallback with no aliasing violations. */ for (; n; n--, s++) *s = c; #endif return dest; } ================================================ FILE: src/memset/shims.c ================================================ #include "decl.h" //////////////////////////////////////////////////////////////////////////////// /// This is a small utility that swaps the builtin call to memset with the /// local implementation of memset, implemented in this project. /// The shared object can be loaded using LD_PRELOAD (on Linux) or /// DYLD_INSERT_LIBRARIES (on Mac). 
//////////////////////////////////////////////////////////////////////////////// void *memset(void *s, int c, size_t n) { return local_memset(s, c, n); } ================================================ FILE: src/memset/test_memset.cc ================================================ #include #include #include #include "decl.h" #include "utils.h" //////////////////////////////////////////////////////////////////////////////// // This is a small program that checks if some memset implementation is correct. // The tool currently checks libc, musl and the local implementation. //////////////////////////////////////////////////////////////////////////////// #define MAGIC_VALUE0 'X' #define MAGIC_VALUE1 'O' void print_buffer(const char *start, const char *end, char val, const char *ptr) { const char *it = start; while (it != end) { std::cout << *it; it++; } std::cout << "\n"; it = start; while (it != ptr) { std::cout << " "; it++; } std::cout << "^\n"; std::cout << "Filling a buffer of length " << end - start << "."; std::cout << " Expected \"" << val << "\" at index " << ptr - start << "\n"; } void assert_uniform_value(const char *start, const char *end, char val) { const char *ptr = start; while (ptr != end) { if (val != *ptr) { print_buffer(start, end, val, ptr); fflush(stdout); abort(); } ptr++; } } void test_impl(memset_ty handle, const std::string &name, unsigned chunk_size) { std::vector memory(chunk_size + 512, MAGIC_VALUE0); // Start mem-setting the array at different offsets. for (int offset = 0; offset < 128; offset++) { const char *buffer_start = &*memory.begin(); const char *buffer_end = &*memory.end(); const char *region_start = &memory[offset]; const char *region_end = region_start + chunk_size; assert_uniform_value(buffer_start, buffer_end, MAGIC_VALUE0); (handle)((void *)region_start, MAGIC_VALUE1, chunk_size); // Check the chunk. assert_uniform_value(region_start, region_end, MAGIC_VALUE1); // Check before chunk. 
assert_uniform_value(buffer_start, region_start, MAGIC_VALUE0); // Check after chunk. assert_uniform_value(region_end, buffer_end, MAGIC_VALUE0); // Reset the buffer: std::fill(memory.begin(), memory.end(), MAGIC_VALUE0); assert_uniform_value(buffer_start, buffer_end, MAGIC_VALUE0); } } int main(int argc, char **argv) { std::cout << "Testing memset... \n"; #define TEST(FUNC, SIZE) test_impl(FUNC, #FUNC, SIZE); for (int i = 0; i < 1024; i++) { TEST(libc_memset, i); TEST(local_memset, i); TEST(musl_memset, i); TEST(asm_memset, i); } std::cout << "Done.\n"; return 0; } ================================================ FILE: src/utils/CMakeLists.txt ================================================ add_library(hist_tool SHARED hist_tool.c ) set_target_properties(hist_tool PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION 1 ) target_compile_options(hist_tool PRIVATE "-fno-builtin") install(TARGETS hist_tool LIBRARY DESTINATION bin) ================================================ FILE: src/utils/hist_tool.c ================================================ #include #include #include #include //////////////////////////////////////////////////////////////////////////////// /// This is a small utility that records calls to some methods and creates a /// histogram of the lengths of calls to memset. It prints the histogram when /// the program is terminated. The shared object can be loaded using LD_PRELOAD /// (on Linux) or DYLD_INSERT_LIBRARIES (on Mac). 
//////////////////////////////////////////////////////////////////////////////// uint32_t memset_len_dist[32] = { 0, }; uint32_t memcpy_len_dist[32] = { 0, }; uint32_t align_dist[32] = { 0, }; const int tab32[32] = {0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31}; int log2_32(uint32_t value) { value |= value >> 1; value |= value >> 2; value |= value >> 4; value |= value >> 8; value |= value >> 16; return tab32[(uint32_t)(value * 0x07C4ACDD) >> 27]; } void __attribute__((destructor)) print_hitograms() { FILE *ff = fopen("/tmp/hist.txt", "a+"); if (!ff) { return; } pid_t pid = getpid(); fprintf(ff, "Histogram for (%d):\n", pid); fprintf(ff, "size, memset, memcpy, alignment:\n"); for (int i = 0; i < 32; i++) { fprintf(ff, "%d, %d, %d, %d,\n", i, memset_len_dist[i], memcpy_len_dist[i], align_dist[i]); } fclose(ff); } void *memcpy(void *dest, const void *src, size_t len) { memcpy_len_dist[log2_32(len)]++; align_dist[(unsigned long)dest % 32]++; align_dist[(unsigned long)src % 32]++; char *d = (char *)dest; char *s = (char *)src; for (size_t i = 0; i < len; i++) { d[i] = s[i]; } return dest; } void *memset(void *s, int c, size_t len) { memset_len_dist[log2_32(len)]++; align_dist[(unsigned long)s % 32]++; char *p = s; for (int i = 0; i < len; i++) { p[i] = c; } return s; }