Repository: WojciechMula/sse4-strstr Branch: master Commit: 9cdc4b6df817 Files: 55 Total size: 186.5 KB Directory structure: gitextract_jd4noqk0/ ├── .gitignore ├── LICENSE ├── Makefile ├── README.rst ├── aarch64-strstr-v2.cpp ├── avx2-naive-strstr.cpp ├── avx2-naive-strstr64.cpp ├── avx2-naive-unrolled-strstr.cpp ├── avx2-strstr-v2-clang-specific.cpp ├── avx2-strstr-v2.cpp ├── avx2-strstr.cpp ├── avx512bw-strstr-v2.cpp ├── avx512bw-strstr-v3.cpp ├── avx512f-strstr-v2.cpp ├── avx512f-strstr.cpp ├── common.h ├── data/ │ └── placeholder ├── fixed-memcmp.cpp ├── make_words.sh ├── neon-strstr-v2.cpp ├── original/ │ ├── sse4_strstr-test.py │ └── sse4_strstr.c ├── results/ │ ├── armv7-32bit-gcc4.9.2.txt │ ├── armv8-64bit-clang3.8.0.txt │ ├── bulldozer-fx-8510-gcc4.8.4-sse.txt │ ├── cascadelake-Gold-5217-gcc-7.4.0-avx512bw.txt │ ├── haswell-i7-4770-gcc5.4.1-avx2.txt │ ├── knights-landing-7210-gcc5.3.0-avx512f.txt │ ├── postprocess.py │ ├── skylake-i7-6700-gcc5.4.1-avx2.txt │ ├── skylake-i9-7900-gcc-5.4.1-avx512bw.txt │ └── westmere-m540-gcc6.2.0-sse4.txt ├── scalar.cpp ├── src/ │ ├── all.h │ ├── all_procedures.cpp │ ├── application_base.cpp │ ├── benchmark.cpp │ ├── benchmark.h │ ├── speedup.cpp │ ├── unittests.cpp │ └── validate.cpp ├── sse-naive-strstr.cpp ├── sse2-needle4.cpp ├── sse2-strstr.cpp ├── sse4-strstr-unrolled.cpp ├── sse4-strstr.cpp ├── sse4.2-strstr.cpp ├── swar32-strstr-v2.cpp ├── swar64-strstr-v2.cpp └── utils/ ├── ansi.cpp ├── avx2.cpp ├── avx512.cpp ├── bits.cpp ├── neon.cpp └── sse.cpp ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ speedup_sse4 benchmark_sse4 unittests_sse4 validate_sse4 speedup_avx2 benchmark_avx2 unittests_avx2 validate_avx2 speedup_avx512f benchmark_avx512f unittests_avx512f validate_avx512f unittests_avx512bw benchmark_avx512bw validate_avx512bw 
speedup_avx512bw speedup_arm unittests_arm validate_arm speedup_aarch64 unittests_aarch64 validate_aarch64 data/i386.txt data/words tags ================================================ FILE: LICENSE ================================================ Copyright (c) 2008-2016, Wojciech Muła All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ================================================ FILE: Makefile ================================================ .PHONY: all clean compile_intel FLAGS=-std=c++11 -O3 -Wall -Wextra -pedantic -I. 
$(CXXFLAGS)
FLAGS_INTEL=$(FLAGS) -DHAVE_SSE_INSTRUCTIONS
FLAGS_SSE4=$(FLAGS_INTEL) -msse4.2
FLAGS_AVX2=$(FLAGS_INTEL) -mavx2 -DHAVE_AVX2_INSTRUCTIONS
FLAGS_AVX512F=$(FLAGS_INTEL) -mavx512f -DHAVE_AVX2_INSTRUCTIONS -DHAVE_AVX512F_INSTRUCTIONS
FLAGS_AVX512BW=$(FLAGS_INTEL) -mavx512bw -DHAVE_AVX2_INSTRUCTIONS -DHAVE_AVX512F_INSTRUCTIONS -DHAVE_AVX512BW_INSTRUCTIONS
FLAGS_ARM=$(FLAGS) -mfpu=neon -DHAVE_NEON_INSTRUCTIONS
FLAGS_AARCH64=$(FLAGS) -DHAVE_NEON_INSTRUCTIONS -DHAVE_AARCH64_ARCHITECTURE

# Source lists; each wider instruction set also pulls in the narrower ones.
DEPS=utils/ansi.cpp utils/bits.cpp common.h fixed-memcmp.cpp
DEPS_SCALAR=swar64-strstr-v2.cpp swar32-strstr-v2.cpp scalar.cpp
DEPS_SSE4=sse4-strstr.cpp sse4-strstr-unrolled.cpp sse4.2-strstr.cpp sse2-strstr.cpp sse-naive-strstr.cpp sse2-needle4.cpp utils/sse.cpp $(DEPS) $(DEPS_SCALAR)
DEPS_AVX2=avx2-*.cpp utils/avx2.cpp $(DEPS_SSE4)
DEPS_AVX512F=avx512f-*.cpp utils/avx512.cpp $(DEPS_AVX2)
DEPS_AVX512BW=avx512bw-*.cpp utils/avx512.cpp $(DEPS_AVX512F)
DEPS_ARM=neon-strstr-v2.cpp $(DEPS) $(DEPS_SCALAR)
DEPS_AARCH64=aarch64-strstr-v2.cpp $(DEPS_ARM)

ALL_INTEL=\
    validate_sse4 \
    speedup_sse4 \
    benchmark_sse4 \
    unittests_sse4 \
    validate_avx2 \
    speedup_avx2 \
    benchmark_avx2 \
    unittests_avx2 \
    validate_avx512f \
    speedup_avx512f \
    benchmark_avx512f \
    unittests_avx512f \
    speedup_avx512bw \
    benchmark_avx512bw \
    validate_avx512bw \
    unittests_avx512bw

ALL_ARM=\
    validate_arm \
    unittests_arm \
    speedup_arm

ALL_AARCH64=\
    validate_aarch64 \
    unittests_aarch64 \
    speedup_aarch64

ALL=$(ALL_INTEL) $(ALL_ARM) $(ALL_AARCH64)

all:
	@echo "select target test_ARCH or run_ARCH"
	@echo
	@echo "test_ARCH runs unit and validation tests"
	@echo "run_ARCH runs performance tests"
	@echo
	@echo "ARCH might be:"
	@echo "* sse4"
	@echo "* avx2"
	@echo "* avx512f"
	@echo "* avx512bw"
	@echo "* arm"
	@echo "* aarch64"

# These targets only group or run other targets; they never create a file
# of the same name.
.PHONY: build_intel build_arm build_aarch64

build_intel: $(ALL_INTEL)

build_arm: $(ALL_ARM)

build_aarch64: $(ALL_AARCH64)

UNITTESTS_DEPS=src/unittests.cpp src/all_procedures.cpp
VALIDATE_DEPS=src/validate.cpp src/application_base.cpp src/all_procedures.cpp
SPEEDUP_DEPS=src/speedup.cpp src/application_base.cpp src/all_procedures.cpp
BENCHMARK_DEPS=src/benchmark.cpp src/benchmark.h src/application_base.cpp src/all_procedures.cpp

validate_sse4: $(VALIDATE_DEPS) $(DEPS_SSE4)
	$(CXX) $(FLAGS_SSE4) src/validate.cpp -o $@

speedup_sse4: $(SPEEDUP_DEPS) $(DEPS_SSE4)
	$(CXX) $(FLAGS_SSE4) -DNDEBUG src/speedup.cpp -o $@

benchmark_sse4: $(BENCHMARK_DEPS) $(DEPS_SSE4)
	$(CXX) $(FLAGS_SSE4) -DNDEBUG src/benchmark.cpp -o $@

unittests_sse4: $(UNITTESTS_DEPS) $(DEPS_SSE4)
	$(CXX) $(FLAGS_SSE4) src/unittests.cpp -o $@

validate_avx2: $(VALIDATE_DEPS) $(DEPS_AVX2)
	$(CXX) $(FLAGS_AVX2) src/validate.cpp -o $@

speedup_avx2: $(SPEEDUP_DEPS) $(DEPS_AVX2)
	$(CXX) $(FLAGS_AVX2) -DNDEBUG src/speedup.cpp -o $@

# Depends on the AVX2 sources so that editing them triggers a rebuild.
benchmark_avx2: $(BENCHMARK_DEPS) $(DEPS_AVX2)
	$(CXX) $(FLAGS_AVX2) -DNDEBUG src/benchmark.cpp -o $@

unittests_avx2: $(UNITTESTS_DEPS) $(DEPS_AVX2)
	$(CXX) $(FLAGS_AVX2) src/unittests.cpp -o $@

validate_avx512f: $(VALIDATE_DEPS) $(DEPS_AVX512F)
	$(CXX) $(FLAGS_AVX512F) src/validate.cpp -o $@

# Depends on the AVX512F sources so that editing them triggers a rebuild.
benchmark_avx512f: $(BENCHMARK_DEPS) $(DEPS_AVX512F)
	$(CXX) $(FLAGS_AVX512F) -DNDEBUG src/benchmark.cpp -o $@

speedup_avx512f: $(SPEEDUP_DEPS) $(DEPS_AVX512F)
	$(CXX) $(FLAGS_AVX512F) -DNDEBUG src/speedup.cpp -o $@

unittests_avx512f: $(UNITTESTS_DEPS) $(DEPS_AVX512F)
	$(CXX) $(FLAGS_AVX512F) src/unittests.cpp -o $@

validate_avx512bw: $(VALIDATE_DEPS) $(DEPS_AVX512BW)
	$(CXX) $(FLAGS_AVX512BW) src/validate.cpp -o $@

speedup_avx512bw: $(SPEEDUP_DEPS) $(DEPS_AVX512BW)
	$(CXX) $(FLAGS_AVX512BW) -DNDEBUG src/speedup.cpp -o $@

# Depends on the AVX512BW sources so that editing them triggers a rebuild.
benchmark_avx512bw: $(BENCHMARK_DEPS) $(DEPS_AVX512BW)
	$(CXX) $(FLAGS_AVX512BW) -DNDEBUG src/benchmark.cpp -o $@

unittests_avx512bw: $(UNITTESTS_DEPS) $(DEPS_AVX512BW)
	$(CXX) $(FLAGS_AVX512BW) src/unittests.cpp -o $@

validate_arm: $(VALIDATE_DEPS) $(DEPS_ARM)
	$(CXX) $(FLAGS_ARM) src/validate.cpp -o $@

speedup_arm: $(SPEEDUP_DEPS) $(DEPS_ARM)
	$(CXX) $(FLAGS_ARM) -DNDEBUG src/speedup.cpp -o $@

unittests_arm: $(UNITTESTS_DEPS) $(DEPS_ARM)
	$(CXX) $(FLAGS_ARM) src/unittests.cpp -o $@

validate_aarch64: $(VALIDATE_DEPS) $(DEPS_AARCH64)
	$(CXX) $(FLAGS_AARCH64) src/validate.cpp -o $@

speedup_aarch64: $(SPEEDUP_DEPS) $(DEPS_AARCH64)
	$(CXX) $(FLAGS_AARCH64) -DNDEBUG src/speedup.cpp -o $@

# Same source list as the other aarch64 binaries (includes aarch64-strstr-v2.cpp).
unittests_aarch64: $(UNITTESTS_DEPS) $(DEPS_AARCH64)
	$(CXX) $(FLAGS_AARCH64) src/unittests.cpp -o $@

data/i386.txt:
	wget http://css.csail.mit.edu/6.858/2013/readings/i386.txt
	mv i386.txt data/i386.txt

data/words: data/i386.txt
	sh make_words.sh $^ $@

.PHONY: test_sse4 run_sse4 test_avx2 run_avx2 test_avx512f run_avx512f
.PHONY: test_avx512bw run_avx512bw test_arm run_arm test_aarch64 run_aarch64

test_sse4: unittests_sse4 validate_sse4 data/words data/i386.txt
	./unittests_sse4
	./validate_sse4 data/i386.txt data/words

run_sse4: speedup_sse4 data/words data/i386.txt
	./speedup_sse4 data/i386.txt data/words

test_avx2: unittests_avx2 validate_avx2 data/words data/i386.txt
	./unittests_avx2
	./validate_avx2 data/i386.txt data/words

run_avx2: speedup_avx2 data/words data/i386.txt
	./speedup_avx2 data/i386.txt data/words

test_avx512f: unittests_avx512f validate_avx512f data/words data/i386.txt
	./unittests_avx512f
	./validate_avx512f data/i386.txt data/words

run_avx512f: speedup_avx512f data/words data/i386.txt
	./speedup_avx512f data/i386.txt data/words

run_avx512bw: speedup_avx512bw data/words data/i386.txt
	./speedup_avx512bw data/i386.txt data/words

test_avx512bw: unittests_avx512bw validate_avx512bw data/words data/i386.txt
	./unittests_avx512bw
	./validate_avx512bw data/i386.txt data/words

test_arm: unittests_arm validate_arm data/words data/i386.txt
	./unittests_arm
	./validate_arm data/i386.txt data/words

run_arm: speedup_arm data/words data/i386.txt
	# my Raspberry Pi is slow, repeat count = 1 is enough
	./$< data/i386.txt data/words 1

test_aarch64: unittests_aarch64 validate_aarch64 data/words data/i386.txt
	./unittests_aarch64
	./validate_aarch64 data/i386.txt data/words

run_aarch64: speedup_aarch64 data/words data/i386.txt
	./$< data/i386.txt data/words 1

compile_intel: $(ALL_INTEL)

clean:
	rm -f $(ALL)

================================================
FILE: README.rst
================================================
================================================================================
        SIMD-friendly algorithms for substring searching
================================================================================

Sample programs for the article "SIMD-friendly algorithms for substring
searching" (http://0x80.pl/articles/simd-strfind.html).

The **root directory** contains C++11 procedures implemented using intrinsics
for SSE, SSE4, AVX2, AVX512F, AVX512BW and ARM Neon (both ARMv7 and ARMv8).

The subdirectory **original** contains 32-bit programs with inline assembly,
written in 2008 for another article__.

__ http://0x80.pl/articles/sse4_substring_locate.html


Usage
------------------------------------------------------------------------

To run unit and validation tests type ``make test_ARCH``, to run
performance tests type ``make run_ARCH``.  Value ``ARCH`` selects
the CPU architecture:

* sse4,
* avx2,
* avx512f,
* avx512bw,
* arm,
* aarch64.


Performance results
------------------------------------------------------------------------

The subdirectory ``results`` contains raw timings from various computers.

================================================ FILE: aarch64-strstr-v2.cpp ================================================ size_t FORCE_INLINE aarch64_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) { assert(k > 0); assert(n > 0); const uint8x16_t first = vdupq_n_u8(needle[0]); const uint8x16_t last = vdupq_n_u8(needle[k - 1]); const uint8_t* ptr = reinterpret_cast(s); for (size_t i = 0; i < n; i += 16) { const uint8x16_t block_first = vld1q_u8(ptr + i); const uint8x16_t block_last = vld1q_u8(ptr + i + k - 1); const uint8x16_t eq_first = vceqq_u8(first, block_first); const uint8x16_t eq_last = vceqq_u8(last, block_last); const uint8x16_t pred_16 = vandq_u8(eq_first, eq_last); uint64_t mask; mask = vgetq_lane_u64(vreinterpretq_u64_u8(pred_16), 0); if (mask) { for (int j=0; j < 8; j++) { if ((mask & 0xff) && (memcmp(s + i + j + 1, needle + 1, k - 2) == 0)) { return i + j; } mask >>= 8; } } mask = vgetq_lane_u64(vreinterpretq_u64_u8(pred_16), 1); if (mask) { for (int j=0; j < 8; j++) { if ((mask & 0xff) && (memcmp(s + i + j + 8 + 1, needle + 1, k - 2) == 0)) { return i + j + 8; } mask >>= 8; } } } return std::string::npos; } // ------------------------------------------------------------------------ template size_t FORCE_INLINE aarch64_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) { assert(k > 0); assert(n > 0); const uint8x16_t first = vdupq_n_u8(needle[0]); const uint8x16_t last = vdupq_n_u8(needle[k - 1]); const uint8_t* ptr = reinterpret_cast(s); for (size_t i = 0; i < n; i += 16) { const uint8x16_t block_first = vld1q_u8(ptr + i); const uint8x16_t block_last = vld1q_u8(ptr + i + k - 1); const uint8x16_t eq_first = vceqq_u8(first, block_first); const uint8x16_t eq_last = vceqq_u8(last, block_last); const uint8x16_t pred_16 = vandq_u8(eq_first, eq_last); uint64_t mask; int j; mask = vgetq_lane_u64(vreinterpretq_u64_u8(pred_16), 0); j = 0; while (mask) { if ((mask & 0xff) && (memcmp_fun(s + i + j + 1, needle + 
1))) { return i + j; } mask >>= 8; j += 1; } mask = vgetq_lane_u64(vreinterpretq_u64_u8(pred_16), 1); j = 0; while (mask) { if ((mask & 0xff) && (memcmp_fun(s + i + j + 8 + 1, needle + 1))) { return i + j + 8; } mask >>= 8; j += 1; } } return std::string::npos; } // ------------------------------------------------------------------------ size_t aarch64_strstr_v2(const char* s, size_t n, const char* needle, size_t k) { size_t result = std::string::npos; if (n < k) { return result; } switch (k) { case 0: return 0; case 1: { const char* res = reinterpret_cast(strchr(s, needle[0])); return (res != nullptr) ? res - s : std::string::npos; } case 2: result = aarch64_strstr_memcmp<2>(s, n, needle, always_true); break; case 3: result = aarch64_strstr_memcmp<3>(s, n, needle, memcmp1); break; case 4: result = aarch64_strstr_memcmp<4>(s, n, needle, memcmp2); break; case 5: result = aarch64_strstr_memcmp<5>(s, n, needle, memcmp4); break; case 6: result = aarch64_strstr_memcmp<6>(s, n, needle, memcmp4); break; case 7: result = aarch64_strstr_memcmp<7>(s, n, needle, memcmp5); break; case 8: result = aarch64_strstr_memcmp<8>(s, n, needle, memcmp6); break; case 9: result = aarch64_strstr_memcmp<9>(s, n, needle, memcmp8); break; case 10: result = aarch64_strstr_memcmp<10>(s, n, needle, memcmp8); break; case 11: result = aarch64_strstr_memcmp<11>(s, n, needle, memcmp9); break; case 12: result = aarch64_strstr_memcmp<12>(s, n, needle, memcmp10); break; default: result = aarch64_strstr_anysize(s, n, needle, k); break; } if (result <= n - k) { return result; } else { return std::string::npos; } } // ------------------------------------------------------------------------ size_t aarch64_strstr_v2(const std::string& s, const std::string& needle) { return aarch64_strstr_v2(s.data(), s.size(), needle.data(), needle.size()); } ================================================ FILE: avx2-naive-strstr.cpp ================================================ // Method descibed in 
https://arxiv.org/pdf/1612.01506.pdf // // Implementation by Daniel Lemire // https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/blob/master/extra/simd/substring/substring.c size_t FORCE_INLINE avx2_naive_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) { assert(k > 0); assert(n > 0); if (n == k) { return (memcmp(s, needle, k) == 0) ? 0 : std::string::npos; } for (size_t i = 0; i < n - k + 1; i += 32) { uint32_t found = 0xffffffff; for (size_t j = 0; (j < k) && (found != 0) ; ++j) { const __m256i textvector = _mm256_loadu_si256((const __m256i *)(s + i + j)); const __m256i needlevector = _mm256_set1_epi8(needle[j]); uint32_t bitmask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(textvector, needlevector)); found = found & bitmask; } if (found != 0) { return i + __builtin_ctz(found); } } return std::string::npos; } // ------------------------------------------------------------------------ size_t avx2_naive_strstr(const char* s, size_t n, const char* needle, size_t k) { size_t result = std::string::npos; if (n < k) { return result; } result = avx2_naive_strstr_anysize(s, n, needle, k); if (result <= n - k) { return result; } else { return std::string::npos; } } // ------------------------------------------------------------------------ size_t avx2_naive_strstr(const std::string& s, const std::string& needle) { return avx2_naive_strstr(s.data(), s.size(), needle.data(), needle.size()); } ================================================ FILE: avx2-naive-strstr64.cpp ================================================ // Method descibed in https://arxiv.org/pdf/1612.01506.pdf // // Implementation by Daniel Lemire // https://github.com/WojciechMula/sse4-strstr/issues/2 size_t FORCE_INLINE avx2_naive_strstr_anysize64(const char* s, size_t n, const char* needle, size_t k) { assert(k > 0); assert(n > 0); const __m256i first = _mm256_set1_epi8(needle[0]); const __m256i last = _mm256_set1_epi8(needle[k - 1]); for (size_t i = 0; i < n; i += 64) { const 
__m256i block_first1 = _mm256_loadu_si256((const __m256i*)(s + i)); const __m256i block_last1 = _mm256_loadu_si256((const __m256i*)(s + i + k - 1)); const __m256i block_first2 = _mm256_loadu_si256((const __m256i*)(s + i + 32)); const __m256i block_last2 = _mm256_loadu_si256((const __m256i*)(s + i + k - 1 + 32)); const __m256i eq_first1 = _mm256_cmpeq_epi8(first, block_first1); const __m256i eq_last1 = _mm256_cmpeq_epi8(last, block_last1); const __m256i eq_first2 = _mm256_cmpeq_epi8(first, block_first2); const __m256i eq_last2 = _mm256_cmpeq_epi8(last, block_last2); const uint32_t mask1 = _mm256_movemask_epi8(_mm256_and_si256(eq_first1, eq_last1)); const uint32_t mask2 = _mm256_movemask_epi8(_mm256_and_si256(eq_first2, eq_last2)); uint64_t mask = mask1 | ((uint64_t)mask2 << 32); while (mask != 0) { const int bitpos = __builtin_ctzll(mask); if (memcmp(s + i + bitpos + 1, needle + 1, k - 2) == 0) { return i + bitpos; } mask = bits::clear_leftmost_set(mask); } } return std::string::npos; } // ------------------------------------------------------------------------ size_t avx2_naive_strstr64(const char* s, size_t n, const char* needle, size_t k) { size_t result = std::string::npos; if (n < k) { return result; } result = avx2_naive_strstr_anysize64(s, n, needle, k); if (result <= n - k) { return result; } else { return std::string::npos; } } // ------------------------------------------------------------------------ size_t avx2_naive_strstr64(const std::string& s, const std::string& needle) { return avx2_naive_strstr64(s.data(), s.size(), needle.data(), needle.size()); } ================================================ FILE: avx2-naive-unrolled-strstr.cpp ================================================ // Method described in https://arxiv.org/pdf/1612.01506.pdf // // Implementation by Daniel Lemire size_t FORCE_INLINE avx2_naive_strstr_unrolled_anysize(const char* s, size_t n, const char* needle, size_t k) { // assert(n % 32 == 0); // deliberately commented out // todo: 
fix it so we can handle variable-length inputs and // can catch matches at the end of the data. for (size_t i = 0; i < n - k; i += 32) { uint32_t found = 0xFFFFFFFF; // 32 1-bits size_t j = 0; for (; (j + 3 < k) && (found != 0) ; j += 4) { __m256i textvector1 = _mm256_loadu_si256((const __m256i *)(s + i + j)); __m256i needlevector1 = _mm256_set1_epi8(needle[j]); __m256i textvector2 = _mm256_loadu_si256((const __m256i *)(s + i + j + 1)); __m256i needlevector2 = _mm256_set1_epi8(needle[j + 1]); __m256i cmp1 = _mm256_cmpeq_epi8(textvector1, needlevector1); __m256i cmp2 = _mm256_cmpeq_epi8(textvector2, needlevector2); __m256i textvector3 = _mm256_loadu_si256((const __m256i *)(s + i + j + 2)); __m256i needlevector3 = _mm256_set1_epi8(needle[j + 2]); __m256i textvector4 = _mm256_loadu_si256((const __m256i *)(s + i + j + 3)); __m256i needlevector4 = _mm256_set1_epi8(needle[j + 3]); __m256i cmp3 = _mm256_cmpeq_epi8(textvector3, needlevector3); __m256i cmp4 = _mm256_cmpeq_epi8(textvector4, needlevector4); __m256i cmp12 = _mm256_and_si256(cmp1,cmp2); __m256i cmp34 = _mm256_and_si256(cmp3,cmp4); uint32_t bitmask = _mm256_movemask_epi8(_mm256_and_si256(cmp12,cmp34)); found = found & bitmask; } for (; (j < k) && (found != 0) ; ++j) { __m256i textvector = _mm256_loadu_si256((const __m256i *)(s + i + j)); __m256i needlevector = _mm256_set1_epi8(needle[j]); uint32_t bitmask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(textvector, needlevector)); found = found & bitmask; } if(found != 0) { // got a match... 
maybe return i + __builtin_ctz(found); } } return std::string::npos; } // ------------------------------------------------------------------------ size_t avx2_naive_unrolled_strstr(const char* s, size_t n, const char* needle, size_t k) { size_t result = std::string::npos; if (n < k) { return result; } result = avx2_naive_strstr_unrolled_anysize(s, n, needle, k); if (result <= n - k) { return result; } else { return std::string::npos; } } // ------------------------------------------------------------------------ size_t avx2_naive_unrolled_strstr(const std::string& s, const std::string& needle) { return avx2_naive_unrolled_strstr(s.data(), s.size(), needle.data(), needle.size()); } ================================================ FILE: avx2-strstr-v2-clang-specific.cpp ================================================ /* The following templates implement the loop, where K is a template parameter. for (unsigned i=1; i < K; i++) { const __m256i substring = _mm256_alignr_epi8(next1, curr, i); eq = _mm256_and_si256(eq, _mm256_cmpeq_epi8(substring, broadcasted[i])); } Clang complains that the loop parameter `i` is a variable and it cannot be applied as a parameter _mm256_alignr_epi8. GCC somehow deals with it. 
*/ #ifdef __clang__ template struct inner_loop_aux; template struct inner_loop_aux { void operator()(__m256i& eq, const __m256i& next1, const __m256i& curr, const __m256i (&broadcasted)[K]) { const __m256i substring = _mm256_alignr_epi8(next1, curr, i); eq = _mm256_and_si256(eq, _mm256_cmpeq_epi8(substring, broadcasted[i])); inner_loop_aux()(eq, next1, curr, broadcasted); } }; template struct inner_loop_aux { void operator()(__m256i&, const __m256i&, const __m256i&, const __m256i (&)[K]) { // nop } }; template struct inner_loop { void operator()(__m256i& eq, const __m256i& next1, const __m256i& curr, const __m256i (&broadcasted)[K]) { static_assert(K > 0, "wrong value"); inner_loop_aux()(eq, next1, curr, broadcasted); } }; #endif ================================================ FILE: avx2-strstr-v2.cpp ================================================ // implements scheme described in http://0x80.pl/articles/simd-friendly-karp-rabin.html size_t FORCE_INLINE avx2_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) { assert(k > 0); assert(n > 0); const __m256i first = _mm256_set1_epi8(needle[0]); const __m256i last = _mm256_set1_epi8(needle[k - 1]); for (size_t i = 0; i < n; i += 32) { const __m256i block_first = _mm256_loadu_si256(reinterpret_cast(s + i)); const __m256i block_last = _mm256_loadu_si256(reinterpret_cast(s + i + k - 1)); const __m256i eq_first = _mm256_cmpeq_epi8(first, block_first); const __m256i eq_last = _mm256_cmpeq_epi8(last, block_last); uint32_t mask = _mm256_movemask_epi8(_mm256_and_si256(eq_first, eq_last)); while (mask != 0) { const auto bitpos = bits::get_first_bit_set(mask); if (memcmp(s + i + bitpos + 1, needle + 1, k - 2) == 0) { return i + bitpos; } mask = bits::clear_leftmost_set(mask); } } return std::string::npos; } #include "avx2-strstr-v2-clang-specific.cpp" template size_t FORCE_INLINE avx2_strstr_eq(const char* s, size_t n, const char* needle) { static_assert(K > 0 && K < 16, "K must be in range [1..15]"); 
assert(n > 0); __m256i broadcasted[K]; for (unsigned i=0; i < K; i++) { broadcasted[i] = _mm256_set1_epi8(needle[i]); } __m256i curr = _mm256_loadu_si256(reinterpret_cast(s)); for (size_t i = 0; i < n; i += 32) { const __m256i next = _mm256_loadu_si256(reinterpret_cast(s + i + 32)); __m256i eq = _mm256_cmpeq_epi8(curr, broadcasted[0]); // AVX2 palignr works on 128-bit lanes, thus some extra work is needed // // curr = [a, b] (2 x 128 bit) // next = [c, d] // substring = [palignr(b, a, i), palignr(c, b, i)] __m256i next1; next1 = _mm256_inserti128_si256(next1, _mm256_extracti128_si256(curr, 1), 0); // b next1 = _mm256_inserti128_si256(next1, _mm256_extracti128_si256(next, 0), 1); // c #ifndef __clang__ for (unsigned i=1; i < K; i++) { const __m256i substring = _mm256_alignr_epi8(next1, curr, i); eq = _mm256_and_si256(eq, _mm256_cmpeq_epi8(substring, broadcasted[i])); } #else inner_loop()(eq, next1, curr, broadcasted); #endif curr = next; const uint32_t mask = _mm256_movemask_epi8(eq); if (mask != 0) { return i + bits::get_first_bit_set(mask); } } return std::string::npos; } template size_t FORCE_INLINE avx2_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) { assert(k > 0); assert(n > 0); const __m256i first = _mm256_set1_epi8(needle[0]); const __m256i last = _mm256_set1_epi8(needle[k - 1]); for (size_t i = 0; i < n; i += 32) { const __m256i block_first = _mm256_loadu_si256(reinterpret_cast(s + i)); const __m256i block_last = _mm256_loadu_si256(reinterpret_cast(s + i + k - 1)); const __m256i eq_first = _mm256_cmpeq_epi8(first, block_first); const __m256i eq_last = _mm256_cmpeq_epi8(last, block_last); uint32_t mask = _mm256_movemask_epi8(_mm256_and_si256(eq_first, eq_last)); while (mask != 0) { const auto bitpos = bits::get_first_bit_set(mask); if (memcmp_fun(s + i + bitpos + 1, needle + 1)) { return i + bitpos; } mask = bits::clear_leftmost_set(mask); } } return std::string::npos; } // 
------------------------------------------------------------------------ size_t avx2_strstr_v2(const char* s, size_t n, const char* needle, size_t k) { size_t result = std::string::npos; if (n < k) { return result; } switch (k) { case 0: return 0; case 1: { const char* res = reinterpret_cast(strchr(s, needle[0])); return (res != nullptr) ? res - s : std::string::npos; } case 2: result = avx2_strstr_eq<2>(s, n, needle); break; case 3: result = avx2_strstr_memcmp<3>(s, n, needle, memcmp1); break; case 4: result = avx2_strstr_memcmp<4>(s, n, needle, memcmp2); break; case 5: // Note: use memcmp4 rather memcmp3, as the last character // of needle is already proven to be equal result = avx2_strstr_memcmp<5>(s, n, needle, memcmp4); break; case 6: result = avx2_strstr_memcmp<6>(s, n, needle, memcmp4); break; case 7: result = avx2_strstr_memcmp<7>(s, n, needle, memcmp5); break; case 8: result = avx2_strstr_memcmp<8>(s, n, needle, memcmp6); break; case 9: // Note: use memcmp8 rather memcmp7 for the same reason as above. 
result = avx2_strstr_memcmp<9>(s, n, needle, memcmp8); break; case 10: result = avx2_strstr_memcmp<10>(s, n, needle, memcmp8); break; case 11: result = avx2_strstr_memcmp<11>(s, n, needle, memcmp9); break; case 12: result = avx2_strstr_memcmp<12>(s, n, needle, memcmp10); break; default: result = avx2_strstr_anysize(s, n, needle, k); break; } if (result <= n - k) { return result; } else { return std::string::npos; } } // ------------------------------------------------------------------------ size_t avx2_strstr_v2(const std::string& s, const std::string& needle) { return avx2_strstr_v2(s.data(), s.size(), needle.data(), needle.size()); } ================================================ FILE: avx2-strstr.cpp ================================================ size_t avx2_strstr_long(const char* s, size_t n, const char* neddle, size_t neddle_size) { assert(neddle_size > 4); assert(n > 0); const uint32_t prefix32 = *reinterpret_cast(neddle); const __m256i prefix = _mm256_set1_epi32(prefix32); const __m256i zeros = _mm256_setzero_si256(); const __m256i permute = _mm256_setr_epi32( 0, 1, 2, 0, 2, 3, 4, 0 ); for (size_t i = 0; i < n; i += 16) { const __m256i in = _mm256_loadu_si256(reinterpret_cast(s + i)); /* [00|01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16|17|18|19|20|21|22|23|24|25|26|27|28|29|30|31] lane | boundary [00|01|02|03|04|05|06|07|08|09|10|11|??|??|??|??|08|09|10|11|12|13|14|15|16|17|18|19|??|??|??|??] 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ */ const __m256i data = _mm256_permutevar8x32_epi32(in, permute); const __m256i result = _mm256_mpsadbw_epu8(data, prefix, 0); const __m256i cmp = _mm256_cmpeq_epi16(result, zeros); uint32_t mask = _mm256_movemask_epi8(cmp) & 0x55555555u; while (mask != 0) { const auto bitpos = bits::get_first_bit_set(mask)/2; if (memcmp(s + i + bitpos + 4, neddle + 4, neddle_size - 4) == 0) { return i + bitpos; } mask = bits::clear_leftmost_set(mask); } } return std::string::npos; } // ------------------------------------------------------------------------ size_t avx2_strstr_len4(const char* s, size_t n, const char* neddle) { assert(n > 0); const uint32_t prefix32 = *reinterpret_cast(neddle); const __m256i prefix = _mm256_set1_epi32(prefix32); const __m256i zeros = _mm256_setzero_si256(); const __m256i permute = _mm256_setr_epi32( 0, 1, 2, 0, 2, 3, 4, 0 ); for (size_t i = 0; i < n; i += 16) { const __m256i in = _mm256_loadu_si256(reinterpret_cast(s + i)); const __m256i data = _mm256_permutevar8x32_epi32(in, permute); const __m256i result = _mm256_mpsadbw_epu8(data, prefix, 0); const __m256i cmp = _mm256_cmpeq_epi16(result, zeros); const uint32_t mask = _mm256_movemask_epi8(cmp) & 0x55555555u; if (mask != 0) { return i + bits::get_first_bit_set(mask)/2; } } return std::string::npos; } // ------------------------------------------------------------------------ size_t avx2_strstr(const char* s, size_t n, const char* neddle, size_t neddle_size) { size_t result = std::string::npos; if (n < neddle_size) { return result; } switch (neddle_size) { case 0: return 0; case 1: { const char* res = reinterpret_cast(strchr(s, neddle[0])); return (res != nullptr) ? res - s : std::string::npos; } case 2: case 3: { const char* res = reinterpret_cast(strstr(s, neddle)); return (res != nullptr) ? 
res - s : std::string::npos; } case 4: result = avx2_strstr_len4(s, n, neddle); break; default: result = avx2_strstr_long(s, n, neddle, neddle_size); break; } if (result <= n - neddle_size) { return result; } else { return std::string::npos; } } // -------------------------------------------------- size_t avx2_strstr(const std::string& s, const std::string& neddle) { return avx2_strstr(s.data(), s.size(), neddle.data(), neddle.size()); } ================================================ FILE: avx512bw-strstr-v2.cpp ================================================ // implements scheme described in http://0x80.pl/articles/simd-friendly-karp-rabin.html size_t avx512bw_strstr_v2_anysize(const char* string, size_t n, const char* needle, size_t k) { assert(n > 0); assert(k > 0); const __m512i first = _mm512_set1_epi8(needle[0]); const __m512i last = _mm512_set1_epi8(needle[k - 1]); char* haystack = const_cast(string); char* end = haystack + n; for (/**/; haystack < end; haystack += 64) { const __m512i block_first = _mm512_loadu_si512(haystack + 0); const __m512i block_last = _mm512_loadu_si512(haystack + k - 1); uint64_t mask = _mm512_cmpeq_epi8_mask(block_first, first) & _mm512_cmpeq_epi8_mask(block_last, last); while (mask != 0) { const uint64_t bitpos = bits::get_first_bit_set(mask); const char* s = reinterpret_cast(haystack); if (memcmp(s + bitpos + 1, needle + 1, k - 2) == 0) { return (s - string) + bitpos; } mask = bits::clear_leftmost_set(mask); } } return size_t(-1); } template size_t avx512bw_strstr_v2_memcmp(const char* string, size_t n, const char* needle, MEMCMP memeq_fun) { assert(n > 0); assert(k > 0); const __m512i first = _mm512_set1_epi8(needle[0]); const __m512i last = _mm512_set1_epi8(needle[k - 1]); char* haystack = const_cast(string); char* end = haystack + n; for (/**/; haystack < end; haystack += 64) { const __m512i block_first = _mm512_loadu_si512(haystack + 0); const __m512i block_last = _mm512_loadu_si512(haystack + k - 1); uint64_t mask = 
_mm512_cmpeq_epi8_mask(block_first, first) & _mm512_cmpeq_epi8_mask(block_last, last); while (mask != 0) { const uint64_t bitpos = bits::get_first_bit_set(mask); const char* s = reinterpret_cast(haystack); if (memeq_fun(s + bitpos + 1, needle + 1)) { return (s - string) + bitpos; } mask = bits::clear_leftmost_set(mask); } } return size_t(-1); } // ------------------------------------------------------------------------ size_t avx512bw_strstr_v2(const char* s, size_t n, const char* needle, size_t k) { size_t result = std::string::npos; if (n < k) { return result; } switch (k) { case 0: return 0; case 1: { const char* res = reinterpret_cast(strchr(s, needle[0])); return (res != nullptr) ? res - s : std::string::npos; } case 2: result = avx512bw_strstr_v2_memcmp<2>(s, n, needle, always_true); break; case 3: result = avx512bw_strstr_v2_memcmp<3>(s, n, needle, memcmp1); break; case 4: result = avx512bw_strstr_v2_memcmp<4>(s, n, needle, memcmp2); break; case 5: result = avx512bw_strstr_v2_memcmp<5>(s, n, needle, memcmp3); break; case 6: result = avx512bw_strstr_v2_memcmp<6>(s, n, needle, memcmp4); break; case 7: result = avx512bw_strstr_v2_memcmp<7>(s, n, needle, memcmp5); break; case 8: result = avx512bw_strstr_v2_memcmp<8>(s, n, needle, memcmp6); break; case 9: result = avx512bw_strstr_v2_memcmp<9>(s, n, needle, memcmp7); break; case 10: result = avx512bw_strstr_v2_memcmp<10>(s, n, needle, memcmp8); break; case 11: result = avx512bw_strstr_v2_memcmp<11>(s, n, needle, memcmp9); break; case 12: result = avx512bw_strstr_v2_memcmp<12>(s, n, needle, memcmp10); break; default: result = avx512bw_strstr_v2_anysize(s, n, needle, k); break; } if (result <= n - k) { return result; } else { return std::string::npos; } } // -------------------------------------------------- size_t avx512bw_strstr_v2(const std::string& s, const std::string& needle) { return avx512bw_strstr_v2(s.data(), s.size(), needle.data(), needle.size()); } ================================================ FILE: 
avx512bw-strstr-v3.cpp ================================================ // implements scheme described in http://0x80.pl/articles/simd-friendly-karp-rabin.html size_t avx512bw_strstr_v3_anysize(const char* string, size_t n, const char* needle, size_t k) { assert(n > 0); assert(k > 0); const __m512i first = _mm512_set1_epi8(needle[0]); const __m512i last = _mm512_set1_epi8(needle[k - 1]); char* haystack = const_cast(string); char* end = haystack + n; for (/**/; haystack < end; haystack += 64) { const __m512i block_first = _mm512_loadu_si512(haystack + 0); const __mmask64 first_eq = _mm512_cmpeq_epi8_mask(block_first, first); if (first_eq == 0) continue; const __m512i block_last = _mm512_loadu_si512(haystack + k - 1); uint64_t mask = _mm512_mask_cmpeq_epi8_mask(first_eq, block_last, last); while (mask != 0) { const uint64_t bitpos = bits::get_first_bit_set(mask); const char* s = reinterpret_cast(haystack); if (memcmp(s + bitpos + 1, needle + 1, k - 2) == 0) { return (s - string) + bitpos; } mask = bits::clear_leftmost_set(mask); } } return size_t(-1); } template size_t avx512bw_strstr_v3_memcmp(const char* string, size_t n, const char* needle, MEMCMP memeq_fun) { assert(n > 0); assert(k > 0); const __m512i first = _mm512_set1_epi8(needle[0]); const __m512i last = _mm512_set1_epi8(needle[k - 1]); char* haystack = const_cast(string); char* end = haystack + n; for (/**/; haystack < end; haystack += 64) { const __m512i block_first = _mm512_loadu_si512(haystack + 0); const __mmask64 first_eq = _mm512_cmpeq_epi8_mask(block_first, first); if (first_eq == 0) continue; const __m512i block_last = _mm512_loadu_si512(haystack + k - 1); uint64_t mask = _mm512_mask_cmpeq_epi8_mask(first_eq, block_last, last); while (mask != 0) { const uint64_t bitpos = bits::get_first_bit_set(mask); const char* s = reinterpret_cast(haystack); if (memeq_fun(s + bitpos + 1, needle + 1)) { return (s - string) + bitpos; } mask = bits::clear_leftmost_set(mask); } } return size_t(-1); } // 
------------------------------------------------------------------------ size_t avx512bw_strstr_v3(const char* s, size_t n, const char* needle, size_t k) { size_t result = std::string::npos; if (n < k) { return result; } switch (k) { case 0: return 0; case 1: { const char* res = reinterpret_cast(strchr(s, needle[0])); return (res != nullptr) ? res - s : std::string::npos; } case 2: result = avx512bw_strstr_v3_memcmp<2>(s, n, needle, always_true); break; case 3: result = avx512bw_strstr_v3_memcmp<3>(s, n, needle, memcmp1); break; case 4: result = avx512bw_strstr_v3_memcmp<4>(s, n, needle, memcmp2); break; case 5: result = avx512bw_strstr_v3_memcmp<5>(s, n, needle, memcmp3); break; case 6: result = avx512bw_strstr_v3_memcmp<6>(s, n, needle, memcmp4); break; case 7: result = avx512bw_strstr_v3_memcmp<7>(s, n, needle, memcmp5); break; case 8: result = avx512bw_strstr_v3_memcmp<8>(s, n, needle, memcmp6); break; case 9: result = avx512bw_strstr_v3_memcmp<9>(s, n, needle, memcmp7); break; case 10: result = avx512bw_strstr_v3_memcmp<10>(s, n, needle, memcmp8); break; case 11: result = avx512bw_strstr_v3_memcmp<11>(s, n, needle, memcmp9); break; case 12: result = avx512bw_strstr_v3_memcmp<12>(s, n, needle, memcmp10); break; default: result = avx512bw_strstr_v3_anysize(s, n, needle, k); break; } if (result <= n - k) { return result; } else { return std::string::npos; } } // -------------------------------------------------- size_t avx512bw_strstr_v3(const std::string& s, const std::string& needle) { return avx512bw_strstr_v3(s.data(), s.size(), needle.data(), needle.size()); } ================================================ FILE: avx512f-strstr-v2.cpp ================================================ // implements scheme described in http://0x80.pl/articles/simd-friendly-karp-rabin.html __mmask16 FORCE_INLINE zero_byte_mask(const __m512i v) { const __m512i v01 = _mm512_set1_epi8(0x01); const __m512i v80 = _mm512_set1_epi8(int8_t(0x80)); const __m512i v1 = _mm512_sub_epi32(v, 
v01); // tmp1 = (v - 0x01010101) & ~v & 0x80808080 const __m512i tmp1 = _mm512_ternarylogic_epi32(v1, v, v80, 0x20); return _mm512_test_epi32_mask(tmp1, tmp1); } size_t avx512f_strstr_v2_anysize(const char* string, size_t n, const char* needle, size_t k) { assert(n > 0); assert(k > 0); const __m512i first = _mm512_set1_epi8(needle[0]); const __m512i last = _mm512_set1_epi8(needle[k - 1]); char* haystack = const_cast(string); char* end = haystack + n; for (/**/; haystack < end; haystack += 64) { const __m512i block_first = _mm512_loadu_si512(haystack + 0); const __m512i block_last = _mm512_loadu_si512(haystack + k - 1); #if 0 const __m512i first_zeros = _mm512_xor_si512(block_first, first); const __m512i last_zeros = _mm512_xor_si512(block_last, last); const __m512i zeros = _mm512_or_si512(first_zeros, last_zeros); #else const __m512i first_zeros = _mm512_xor_si512(block_first, first); /* first_zeros | block_last | last | first_zeros | (block_last ^ last) ------------+------------+------+------------------------------------ 0 | 0 | 0 | 0 0 | 0 | 1 | 1 0 | 1 | 0 | 1 0 | 1 | 1 | 0 1 | 0 | 0 | 1 1 | 0 | 1 | 1 1 | 1 | 0 | 1 1 | 1 | 1 | 1 */ const __m512i zeros = _mm512_ternarylogic_epi32(first_zeros, block_last, last, 0xf6); #endif uint32_t mask = zero_byte_mask(zeros); while (mask) { const uint64_t p = __builtin_ctz(mask); if (memcmp(haystack + 4*p + 0, needle, k) == 0) { return (haystack - string) + 4*p + 0; } if (memcmp(haystack + 4*p + 1, needle, k) == 0) { return (haystack - string) + 4*p + 1; } if (memcmp(haystack + 4*p + 2, needle, k) == 0) { return (haystack - string) + 4*p + 2; } if (memcmp(haystack + 4*p + 3, needle, k) == 0) { return (haystack - string) + 4*p + 3; } mask = bits::clear_leftmost_set(mask); } } return size_t(-1); } template size_t avx512f_strstr_v2_memcmp(const char* string, size_t n, const char* needle, MEMCMP memeq_fun) { assert(n > 0); assert(k > 0); const __m512i first = _mm512_set1_epi8(needle[0]); const __m512i last = 
_mm512_set1_epi8(needle[k - 1]); char* haystack = const_cast(string); char* end = haystack + n; for (/**/; haystack < end; haystack += 64) { const __m512i block_first = _mm512_loadu_si512(haystack + 0); const __m512i block_last = _mm512_loadu_si512(haystack + k - 1); const __m512i first_zeros = _mm512_xor_si512(block_first, first); const __m512i zeros = _mm512_ternarylogic_epi32(first_zeros, block_last, last, 0xf6); uint32_t mask = zero_byte_mask(zeros); while (mask) { const uint64_t p = __builtin_ctz(mask); if (memeq_fun(haystack + 4*p + 0, needle)) { return (haystack - string) + 4*p + 0; } if (memeq_fun(haystack + 4*p + 1, needle)) { return (haystack - string) + 4*p + 1; } if (memeq_fun(haystack + 4*p + 2, needle)) { return (haystack - string) + 4*p + 2; } if (memeq_fun(haystack + 4*p + 3, needle)) { return (haystack - string) + 4*p + 3; } mask = bits::clear_leftmost_set(mask); } } return size_t(-1); } // ------------------------------------------------------------------------ size_t avx512f_strstr_v2(const char* s, size_t n, const char* needle, size_t k) { size_t result = std::string::npos; if (n < k) { return result; } switch (k) { case 0: return 0; case 1: { const char* res = reinterpret_cast(strchr(s, needle[0])); return (res != nullptr) ? 
res - s : std::string::npos; } case 2: result = avx512f_strstr_v2_memcmp<2>(s, n, needle, memcmp2); break; case 3: result = avx512f_strstr_v2_memcmp<3>(s, n, needle, memcmp3); break; case 4: result = avx512f_strstr_v2_memcmp<4>(s, n, needle, memcmp4); break; case 5: result = avx512f_strstr_v2_memcmp<5>(s, n, needle, memcmp5); break; case 6: result = avx512f_strstr_v2_memcmp<6>(s, n, needle, memcmp6); break; case 7: result = avx512f_strstr_v2_memcmp<7>(s, n, needle, memcmp7); break; case 8: result = avx512f_strstr_v2_memcmp<8>(s, n, needle, memcmp8); break; case 9: result = avx512f_strstr_v2_memcmp<9>(s, n, needle, memcmp9); break; case 10: result = avx512f_strstr_v2_memcmp<10>(s, n, needle, memcmp10); break; case 11: result = avx512f_strstr_v2_memcmp<11>(s, n, needle, memcmp11); break; case 12: result = avx512f_strstr_v2_memcmp<12>(s, n, needle, memcmp12); break; default: result = avx512f_strstr_v2_anysize(s, n, needle, k); break; } if (result <= n - k) { return result; } else { return std::string::npos; } } // -------------------------------------------------- size_t avx512f_strstr_v2(const std::string& s, const std::string& needle) { return avx512f_strstr_v2(s.data(), s.size(), needle.data(), needle.size()); } ================================================ FILE: avx512f-strstr.cpp ================================================ /* string - pointer to the string n - string length in bytes needle - pointer to another string n - needle length in bytes */ size_t avx512f_strstr_long(const char* string, size_t n, const char* needle, size_t k) { assert(n > 0); assert(k > 4); __m512i curr; __m512i next; __m512i v0, v1, v2, v3; char* haystack = const_cast(string); char* last = haystack + n; const uint32_t prf = *(uint32_t*)needle; // the first 4 bytes of needle const __m512i prefix = _mm512_set1_epi32(prf); next = _mm512_loadu_si512(haystack); for (/**/; haystack < last; haystack += 64) { curr = next; next = _mm512_loadu_si512(haystack + 64); const __m512i shft = 
_mm512_alignr_epi32(next, curr, 1); v0 = curr; { const __m512i t1 = _mm512_srli_epi32(curr, 8); const __m512i t2 = _mm512_slli_epi32(shft, 24); v1 = _mm512_or_si512(t1, t2); } { const __m512i t1 = _mm512_srli_epi32(curr, 16); const __m512i t2 = _mm512_slli_epi32(shft, 16); v2 = _mm512_or_si512(t1, t2); } { const __m512i t1 = _mm512_srli_epi32(curr, 24); const __m512i t2 = _mm512_slli_epi32(shft, 8); v3 = _mm512_or_si512(t1, t2); } uint16_t m0 = _mm512_cmpeq_epi32_mask(v0, prefix); uint16_t m1 = _mm512_cmpeq_epi32_mask(v1, prefix); uint16_t m2 = _mm512_cmpeq_epi32_mask(v2, prefix); uint16_t m3 = _mm512_cmpeq_epi32_mask(v3, prefix); int index = 64; while (m0 | m1 | m2 | m3) { if (m0) { int pos = __builtin_ctz(m0) * 4 + 0; m0 = m0 & (m0 - 1); if (pos < index && memcmp(haystack + pos + 4, needle + 4, k - 4) == 0) { index = pos; } } if (m1) { int pos = __builtin_ctz(m1) * 4 + 1; m1 = m1 & (m1 - 1); if (pos < index && memcmp(haystack + pos + 4, needle + 4, k - 4) == 0) { index = pos; } } if (m2) { int pos = __builtin_ctz(m2) * 4 + 2; m2 = m2 & (m2 - 1); if (pos < index && memcmp(haystack + pos + 4, needle + 4, k - 4) == 0) { index = pos; } } if (m3) { int pos = __builtin_ctz(m3) * 4 + 3; m3 = m3 & (m3 - 1); if (pos < index && memcmp(haystack + pos + 4, needle + 4, k - 4) == 0) { index = pos; } } } if (index < 64) { return (haystack - string) + index; } } return size_t(-1); } // ------------------------------------------------------------------------ size_t avx512f_strstr_eq4(const char* string, size_t n, const char* needle) { assert(n > 0); __m512i curr; __m512i next; __m512i v0, v1, v2, v3; char* haystack = const_cast(string); char* last = haystack + n; const uint32_t prf = *(uint32_t*)needle; // the first 4 bytes of needle const __m512i prefix = _mm512_set1_epi32(prf); next = _mm512_loadu_si512(haystack); for (/**/; haystack < last; haystack += 64) { curr = next; next = _mm512_loadu_si512(haystack + 64); const __m512i shft = _mm512_alignr_epi32(next, curr, 1); v0 = 
curr; { const __m512i t1 = _mm512_srli_epi32(curr, 8); const __m512i t2 = _mm512_slli_epi32(shft, 24); v1 = _mm512_or_si512(t1, t2); } { const __m512i t1 = _mm512_srli_epi32(curr, 16); const __m512i t2 = _mm512_slli_epi32(shft, 16); v2 = _mm512_or_si512(t1, t2); } { const __m512i t1 = _mm512_srli_epi32(curr, 24); const __m512i t2 = _mm512_slli_epi32(shft, 8); v3 = _mm512_or_si512(t1, t2); } uint16_t m0 = _mm512_cmpeq_epi32_mask(v0, prefix); uint16_t m1 = _mm512_cmpeq_epi32_mask(v1, prefix); uint16_t m2 = _mm512_cmpeq_epi32_mask(v2, prefix); uint16_t m3 = _mm512_cmpeq_epi32_mask(v3, prefix); int index = 64; if (m0) { int pos = __builtin_ctz(m0) * 4 + 0; if (pos < index) { index = pos; } } if (m1) { int pos = __builtin_ctz(m1) * 4 + 1; if (pos < index) { index = pos; } } if (m2) { int pos = __builtin_ctz(m2) * 4 + 2; if (pos < index) { index = pos; } } if (m3) { int pos = __builtin_ctz(m3) * 4 + 3; if (pos < index) { index = pos; } } if (index < 64) { return (haystack - string) + index; } assert(m0 == 0 && m1 == 0 && m2 == 0 && m3 == 0); } return size_t(-1); } // ------------------------------------------------------------------------ size_t avx512f_strstr(const char* s, size_t n, const char* needle, size_t needle_size) { size_t result = std::string::npos; if (n < needle_size) { return result; } switch (needle_size) { case 0: return 0; case 1: { const char* res = reinterpret_cast(strchr(s, needle[0])); return (res != nullptr) ? res - s : std::string::npos; } case 2: case 3: { const char* res = reinterpret_cast(strstr(s, needle)); return (res != nullptr) ? 
res - s : std::string::npos; } case 4: result = avx512f_strstr_eq4(s, n, needle); break; default: result = avx512f_strstr_long(s, n, needle, needle_size); break; } if (result <= n - needle_size) { return result; } else { return std::string::npos; } } // -------------------------------------------------- size_t avx512f_strstr(const std::string& s, const std::string& needle) { return avx512f_strstr(s.data(), s.size(), needle.data(), needle.size()); } ================================================ FILE: common.h ================================================ #pragma once #define FORCE_INLINE inline __attribute__((always_inline)) #define MAYBE_UNUSED inline __attribute__((unused)) #if defined(HAVE_NEON_INSTRUCTIONS) # include # define USE_SIMPLE_MEMCMP // for fixed-memcmp.cpp #else # include #endif ================================================ FILE: data/placeholder ================================================ placeholder ================================================ FILE: fixed-memcmp.cpp ================================================ // #define USE_SIMPLE_MEMCMP // when defined simpler expressions are used namespace { MAYBE_UNUSED bool always_true(const char*, const char*) { return true; } MAYBE_UNUSED bool memcmp1(const char* a, const char* b) { return a[0] == b[0]; } MAYBE_UNUSED bool memcmp2(const char* a, const char* b) { const uint16_t A = *reinterpret_cast(a); const uint16_t B = *reinterpret_cast(b); return A == B; } MAYBE_UNUSED bool memcmp3(const char* a, const char* b) { #ifdef USE_SIMPLE_MEMCMP return memcmp2(a, b) && memcmp1(a + 2, b + 2); #else const uint32_t A = *reinterpret_cast(a); const uint32_t B = *reinterpret_cast(b); return (A & 0x00ffffff) == (B & 0x00ffffff); #endif } MAYBE_UNUSED bool memcmp4(const char* a, const char* b) { const uint32_t A = *reinterpret_cast(a); const uint32_t B = *reinterpret_cast(b); return A == B; } MAYBE_UNUSED bool memcmp5(const char* a, const char* b) { #ifdef USE_SIMPLE_MEMCMP return memcmp4(a, b) && 
memcmp1(a + 4, b + 4); #else const uint64_t A = *reinterpret_cast(a); const uint64_t B = *reinterpret_cast(b); return ((A ^ B) & 0x000000fffffffffflu) == 0; #endif } MAYBE_UNUSED bool memcmp6(const char* a, const char* b) { #ifdef USE_SIMPLE_MEMCMP return memcmp4(a, b) && memcmp2(a + 4, b + 4); #else const uint64_t A = *reinterpret_cast(a); const uint64_t B = *reinterpret_cast(b); return ((A ^ B) & 0x0000fffffffffffflu) == 0; #endif } MAYBE_UNUSED bool memcmp7(const char* a, const char* b) { #ifdef USE_SIMPLE_MEMCMP return memcmp4(a, b) && memcmp3(a + 4, b + 4); #else const uint64_t A = *reinterpret_cast(a); const uint64_t B = *reinterpret_cast(b); return ((A ^ B) & 0x00fffffffffffffflu) == 0; #endif } MAYBE_UNUSED bool memcmp8(const char* a, const char* b) { const uint64_t A = *reinterpret_cast(a); const uint64_t B = *reinterpret_cast(b); return A == B; } MAYBE_UNUSED bool memcmp9(const char* a, const char* b) { const uint64_t A = *reinterpret_cast(a); const uint64_t B = *reinterpret_cast(b); return (A == B) & (a[8] == b[8]); } MAYBE_UNUSED bool memcmp10(const char* a, const char* b) { const uint64_t Aq = *reinterpret_cast(a); const uint64_t Bq = *reinterpret_cast(b); const uint16_t Aw = *reinterpret_cast(a + 8); const uint16_t Bw = *reinterpret_cast(b + 8); return (Aq == Bq) & (Aw == Bw); } MAYBE_UNUSED bool memcmp11(const char* a, const char* b) { #ifdef USE_SIMPLE_MEMCMP return memcmp8(a, b) && memcmp3(a + 8, b + 8); #else const uint64_t Aq = *reinterpret_cast(a); const uint64_t Bq = *reinterpret_cast(b); const uint32_t Ad = *reinterpret_cast(a + 8); const uint32_t Bd = *reinterpret_cast(b + 8); return (Aq == Bq) & ((Ad & 0x00ffffff) == (Bd & 0x00ffffff)); #endif } MAYBE_UNUSED bool memcmp12(const char* a, const char* b) { const uint64_t Aq = *reinterpret_cast(a); const uint64_t Bq = *reinterpret_cast(b); const uint32_t Ad = *reinterpret_cast(a + 8); const uint32_t Bd = *reinterpret_cast(b + 8); return (Aq == Bq) & (Ad == Bd); } } 
================================================ FILE: make_words.sh ================================================ # split words cat $1 \ | tr -s -c "a-zA-Z" "\n" \ | sort -u \ > $2 ================================================ FILE: neon-strstr-v2.cpp ================================================ size_t FORCE_INLINE neon_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) { assert(k > 0); assert(n > 0); const uint8x16_t first = vdupq_n_u8(needle[0]); const uint8x16_t last = vdupq_n_u8(needle[k - 1]); const uint8x8_t half = vdup_n_u8(0x0f); const uint8_t* ptr = reinterpret_cast(s); union { uint8_t tmp[8]; uint32_t word[2]; }; for (size_t i = 0; i < n; i += 16) { const uint8x16_t block_first = vld1q_u8(ptr + i); const uint8x16_t block_last = vld1q_u8(ptr + i + k - 1); const uint8x16_t eq_first = vceqq_u8(first, block_first); const uint8x16_t eq_last = vceqq_u8(last, block_last); const uint8x16_t pred_16 = vandq_u8(eq_first, eq_last); const uint8x8_t pred_8 = vbsl_u8(half, vget_low_u8(pred_16), vget_high_u8(pred_16)); vst1_u8(tmp, pred_8); if ((word[0] | word[1]) == 0) { continue; } #if 0 for (int j=0; j < 8; j++) { if ((tmp[j] & 0x0f) && (memcmp(s + i + j + 1, needle + 1, k - 2) == 0)) { return i + j; } } for (int j=0; j < 8; j++) { if ((tmp[j] & 0xf0) && (memcmp(s + i + j + 1 + 8, needle + 1, k - 2) == 0)) { return i + j + 8; } } #else // the above loops unrolled uint32_t v; #define RETURN_IF_EQ(MASK, SHIFT) \ if ((v & MASK) && memcmp(s + i + SHIFT + 1, needle + 1, k - 2) == 0) { \ return i + SHIFT; \ } #define COMPARE(MASK, WORD_IDX, SHIFT) \ v = word[WORD_IDX]; \ RETURN_IF_EQ(MASK, SHIFT + 0); \ v >>= 8; \ RETURN_IF_EQ(MASK, SHIFT + 1); \ v >>= 8; \ RETURN_IF_EQ(MASK, SHIFT + 2); \ v >>= 8; \ RETURN_IF_EQ(MASK, SHIFT + 3); COMPARE(0x0f, 0, 0); COMPARE(0x0f, 1, 4); COMPARE(0xf0, 0, 8); COMPARE(0xf0, 1, 12); #undef RETURN_IF_EQ #undef COMPARE #endif } return std::string::npos; } // 
------------------------------------------------------------------------ template size_t FORCE_INLINE neon_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) { assert(k > 0); assert(n > 0); const uint8x16_t first = vdupq_n_u8(needle[0]); const uint8x16_t last = vdupq_n_u8(needle[k - 1]); const uint8x8_t half = vdup_n_u8(0x0f); const uint8_t* ptr = reinterpret_cast(s); union { uint8_t tmp[8]; uint32_t word[2]; }; for (size_t i = 0; i < n; i += 16) { const uint8x16_t block_first = vld1q_u8(ptr + i); const uint8x16_t block_last = vld1q_u8(ptr + i + k - 1); const uint8x16_t eq_first = vceqq_u8(first, block_first); const uint8x16_t eq_last = vceqq_u8(last, block_last); const uint8x16_t pred_16 = vandq_u8(eq_first, eq_last); const uint8x8_t pred_8 = vbsl_u8(half, vget_low_u8(pred_16), vget_high_u8(pred_16)); vst1_u8(tmp, pred_8); if ((word[0] | word[1]) == 0) { continue; } #if 0 for (int j=0; j < 8; j++) { if ((tmp[j] & 0x0f) && memcmp_fun(s + i + j + 1, needle + 1)) { return i + j; } } for (int j=0; j < 8; j++) { if ((tmp[j] & 0xf0) && memcmp_fun(s + i + j + 1 + 8, needle + 1)) { return i + j + 8; } } #else // the above loops unrolled uint32_t v; #define RETURN_IF_EQ(MASK, SHIFT) \ if ((v & MASK) && memcmp_fun(s + i + SHIFT + 1, needle + 1)) { \ return i + SHIFT; \ } #define COMPARE(MASK, WORD_IDX, SHIFT) \ v = word[WORD_IDX]; \ RETURN_IF_EQ(MASK, SHIFT + 0); \ v >>= 8; \ RETURN_IF_EQ(MASK, SHIFT + 1); \ v >>= 8; \ RETURN_IF_EQ(MASK, SHIFT + 2); \ v >>= 8; \ RETURN_IF_EQ(MASK, SHIFT + 3); COMPARE(0x0f, 0, 0); COMPARE(0x0f, 1, 4); COMPARE(0xf0, 0, 8); COMPARE(0xf0, 1, 12); #undef RETURN_IF_EQ #undef COMPARE #endif } return std::string::npos; } // ------------------------------------------------------------------------ size_t neon_strstr_v2(const char* s, size_t n, const char* needle, size_t k) { size_t result = std::string::npos; if (n < k) { return result; } switch (k) { case 0: return 0; case 1: { const char* res = reinterpret_cast(strchr(s, 
needle[0])); return (res != nullptr) ? res - s : std::string::npos; } case 2: result = neon_strstr_memcmp<2>(s, n, needle, always_true); break; case 3: result = neon_strstr_memcmp<3>(s, n, needle, memcmp1); break; case 4: result = neon_strstr_memcmp<4>(s, n, needle, memcmp2); break; case 5: result = neon_strstr_memcmp<5>(s, n, needle, memcmp4); break; case 6: result = neon_strstr_memcmp<6>(s, n, needle, memcmp4); break; case 7: result = neon_strstr_memcmp<7>(s, n, needle, memcmp5); break; case 8: result = neon_strstr_memcmp<8>(s, n, needle, memcmp6); break; case 9: result = neon_strstr_memcmp<9>(s, n, needle, memcmp8); break; case 10: result = neon_strstr_memcmp<10>(s, n, needle, memcmp8); break; case 11: result = neon_strstr_memcmp<11>(s, n, needle, memcmp9); break; case 12: result = neon_strstr_memcmp<12>(s, n, needle, memcmp10); break; default: result = neon_strstr_anysize(s, n, needle, k); break; } if (result <= n - k) { return result; } else { return std::string::npos; } } // ------------------------------------------------------------------------ size_t neon_strstr_v2(const std::string& s, const std::string& needle) { return neon_strstr_v2(s.data(), s.size(), needle.data(), needle.size()); } ================================================ FILE: original/sse4_strstr-test.py ================================================ import sys, os, random filename = "" try: filename = sys.argv[1] string = open(filename, "r").read() except: print "can't open '%s'" % filename sys.exit(1) try: random.seed(int(sys.argv[3])) except: pass def time_command(command): os.system('/usr/bin/time -o /tmp/measure -f "%U" ' + command) f = open("/tmp/measure", "r") t = float(f.read()) f.close() return t def time(command1, command2, iters=10): while True: t1 = time_command(command1.replace("__iters__", str(iters))) if t1 > 1: t2 = time_command(command2.replace("__iters__", str(iters))) return iters, t1, t2 else: iters *= 10 def compare(filename, wordpos, word, wordlen): word = 
word.replace("%", "%%") cmd1 = './a.out "%s" libc __iters__ "%s" > /dev/null' % (filename, word) cmd2 = './a.out "%s" sse4 __iters__ "%s" > /dev/null' % (filename, word) _, t1, t2 = time(cmd1, cmd2) return "[%d,%d] libc=%0.3fs sse4=%0.3fs speedup=%0.2f" % (wordpos, wordlen, t1, t2, t1/t2) logname = "sse4.log" lognumber = 1 while True: if not os.path.exists(logname): log = open(logname, "w") break else: logname = "sse4%d.log" % lognumber lognumber += 1 try: for n in xrange(4, 64): i1 = random.randint( 0, 64) i2 = random.randint( 65, 1024) i3 = random.randint(1024, len(string)-n) print "length", n for i in [i1, i2, i3]: word = string[i:i+n] for c in "\\`()<>{}\"": word = word.replace(c, "\\" + c) cmd = './a.out "%s" verify 1 "%s"' % (filename, word) err = os.system(cmd) if err: print repr(string[i:i+l]) sys.exit(1) else: s = compare(filename, i, word, n) log.write(s + "\n") print s except: import traceback traceback.print_exc() log.close() ================================================ FILE: original/sse4_strstr.c ================================================ /* SSE4 string search --- modification of Karp-Rabin algorithm, $Revision: 1.11 $ Acceleration of strstr using SSE4 instruction MPSADBW. 
This program includes one wrapper, sse4_strstr, around the following functions:

	* sse4_strstr_any - exact comparison is done with the library function strncmp
	* sse4_strstr_len3, sse4_strstr_len4 - optimized for substrings of length 3
	  and 4 chars, no additional comparison is needed
	* sse4_strstr_max20, sse4_strstr_max36 - optimized for substrings of length
	  5..20 and 21..36, exact comparison is done with a few assembler instructions

	Author: Wojciech Muła
	e-mail: wojciech_mula@poczta.onet.pl
	www:    http://0x80.pl/

	License: BSD

	initial release 27-05-2008, last update $Date: 2008-06-08 23:00:44 $
*/

#include
#include
#include
#include
#include

static uint8_t mask[][16] = {
	{0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},
	{0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},
	{0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},
	{0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},
	{0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},
	{0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},
	{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},
	{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},
	{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00},
	{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00},
	{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00},
	{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00},
	{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00},
	{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00},
	{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00},
	{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff},
};
char* sse4_strstr_any(char* s1, int n1, char* s2, int n2); char* sse4_strstr_len3(char* s1, int n1, char* s2, int n2); char* sse4_strstr_len4(char* s1, int n1, char* s2, int n2); char* sse4_strstr_max20(char* s1, int n1, char* s2, int n2); char* sse4_strstr_max36(char* s1, int n1, char* s2, int n2); char* sse4_strstr(char* s1, int n1, char* s2, int n2) { switch (n1) { case 0: return NULL; case 1: return strchr(s2, s1[1]); case 2: return strstr(s2, s1); case 3: return sse4_strstr_len3(s1, n1, s2, n2); case 4: return sse4_strstr_len4(s1, n1, s2, n2); case 5: case 6: case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: case 15: case 16: case 17: case 18: case 19: case 20: /* 5..20 */ return sse4_strstr_max20(s1, n1, s2, n2); case 21: case 22: case 23: case 24: case 25: case 26: case 27: case 28: case 29: case 30: case 31: case 32: case 33: case 34: case 35: case 36: /* 21..36 */ return sse4_strstr_max36(s1, n1, s2, n2); default: return sse4_strstr_any(s1, n1, s2, n2); } } char* sse4_strstr_any(char* s1, int n1, char* s2, int n2) { // n1 > 4, n2 > 4 char* result; uint32_t dummy __attribute__((unused)); __asm__ volatile ("movdqu (%%eax), %%xmm1" : : "a" (s1)); __asm__ volatile ("pxor %%xmm0, %%xmm0" : : ); __asm__ volatile ( /*** initialization ****************************************************/ // we have to save 3 registers: eax, ecx and edx // also strncmp needs three arguments, thus esp -= (3+3)*4 = " addl $-24, %%esp \n" // function strncmp is invoke with argument s1+4, s2+4, n1-4 -- s1+4 and // n1-4 are constant across all iterations, thus stack frame // can be partially initialize: " movl 8(%%ebp), %%eax \n" " addl $4, %%eax \n" " movl %%eax, 0(%%esp) \n" // s1+4 " \n" " movl 12(%%ebp), %%eax \n" " subl $4, %%eax \n" " movl %%eax, 8(%%esp) \n" // n1-4 " \n" /*** main loop *********************************************************/ "0: \n" // load 16 bytes, we consider just 8+3 chars at the beggining " movdqu (%%esi), %%xmm2 \n" " addl $8, %%esi 
\n" // advance pointer: s2 += 8 (esi is bound to s2)

		// xmm2 - vector of L1 distances between s1's 4-byte prefix
		//        and sequence of eight 4-byte subvectors from xmm2
		" mpsadbw $0, %%xmm1, %%xmm2 \n"

		// xmm2 - word becomes 0xffff if L1=0, 0x0000 otherwise
		" pcmpeqw %%xmm0, %%xmm2 \n"

		// any L1=0? if no, skip comparison inner loop
		// (xmm0 is zero, so ptest sets CF exactly when xmm2 is all zeros)
		" ptest %%xmm2, %%xmm0 \n"
		" jc 1f \n"

		/*** inner loop ************************************************/

			// comparison inner loop: convert word mask to bitmask
			" pmovmskb %%xmm2, %%edx \n"
			// we are interested in **word** indexes
			" andl $0b0101010101010101, %%edx \n"

		" 2: \n"
			" bsf %%edx, %%eax \n" // get next bit position
			" jz 1f \n" // no bit set? exit loop
			" \n"
			" btr %%eax, %%edx \n" // unset bit
			" shr $1, %%eax \n" // divide position by 2

			// save registers before invoking strncmp
			" movl %%eax, 12(%%esp) \n"
			" movl %%ecx, 16(%%esp) \n"
			" movl %%edx, 20(%%esp) \n"

			// update function argument
			" leal -4(%%esi, %%eax), %%eax \n"
			" movl %%eax, 4(%%esp) \n" // s2+4

			// invoke strncmp(s1+4, s2+4, n1-4)
			" call strncmp \n"
			" test %%eax, %%eax \n" // result == 0?

			// restore registers (movl leaves the flags set by test intact)
			" movl 12(%%esp), %%eax \n"
			" movl 16(%%esp), %%ecx \n"
			" movl 20(%%esp), %%edx \n"
			" jnz 2b \n"

			" leal -8(%%eax, %%esi), %%eax \n" // eax -- address
			" jmp 4f \n" // of s1's first occurrence

	/*** main loop epilogue ************************************************/
	"1: \n"
		" subl $8, %%ecx \n"
		" cmpl $0, %%ecx \n"
		" jg 0b \n"

		" xorl %%eax, %%eax \n" // s1 not found, return NULL

	"4: \n"
		" addl $24, %%esp \n" // and finally restore stack frame

	: "=a" (result),
	  "=S" (dummy),
	  "=c" (dummy)

	: "S" (s2),
	  "c" (n2-n1)
	);

	return result;
}


/*
	Variant for patterns of 5..20 bytes: candidates are located with MPSADBW
	on the 4-byte prefix, the remaining n1-4 bytes are then verified with a
	single 16-byte compare under mask[n1-5].
*/
char* sse4_strstr_max20(char* s1, int n1, char* s2, int n2) {
	// 5 <= n1 <= 20 (mask is indexed with n1-5), n2 > 4
	uint32_t dummy __attribute__((unused));
	char* result;

	__asm__ volatile ("movdqu (%%eax), %%xmm6" : : "a" (mask[n1-5]));
	__asm__ volatile ("movdqu (%%eax), %%xmm1" : : "a" (s1));
	__asm__ volatile ("movdqu (%%eax), %%xmm2" : : "a" (s1+4)); // xmm2 -- s1 suffix
	__asm__ volatile ("pxor %%xmm0, %%xmm0" : : );

	__asm__ volatile (
	/*** main loop *********************************************************/
	"0: \n"
		// load 16 bytes, MPSADBW considers just 8+3 chars at the beginning
		" movdqu (%%esi), %%xmm7 \n"
		" addl $8, %%esi \n" // advance pointer: s2 += 8 (esi is bound to s2)

		// xmm7 - vector of L1 distances between s1's 4-byte prefix
		//        and sequence of eight 4-byte subvectors from xmm7
		" mpsadbw $0, %%xmm1, %%xmm7 \n"

		// xmm7 - word becomes 0xffff if L1=0, 0x0000 otherwise
		" pcmpeqw %%xmm0, %%xmm7 \n"

		// any L1=0? if no, skip comparison inner loop
		" ptest %%xmm7, %%xmm0 \n"
		" jc 1f \n"

		/*** inner loop ************************************************/

			// comparison inner loop: convert word mask to bitmask
			" pmovmskb %%xmm7, %%edx \n"
			// we are interested in **word** positions
			" andl $0b0101010101010101, %%edx \n"

		" 2: \n"
			" bsf %%edx, %%eax \n" // get next bit position
			" jz 1f \n" // no bit set? exit loop
			" \n"
			" btr %%eax, %%edx \n" // unset bit
			" shr $1, %%eax \n" // divide position by 2

			// compare the 16 bytes after the candidate's prefix with s1+4;
			// ptest sets CF when every byte selected by the mask in xmm6 is equal
			" movdqu -4(%%esi, %%eax), %%xmm7 \n"
			" pcmpeqb %%xmm2, %%xmm7 \n"
			" ptest %%xmm6, %%xmm7 \n"
			" jnc 2b \n"

			" leal -8(%%eax, %%esi), %%eax \n" // eax -- address
			" jmp 4f \n" // of s1's first occurrence

	/*** main loop epilogue ************************************************/
	"1: \n"
		" subl $8, %%ecx \n"
		" cmpl $0, %%ecx \n"
		" jg 0b \n"

		" xorl %%eax, %%eax \n" // s1 not found, return NULL

	"4: \n"

	: "=a" (result),
	  "=S" (dummy),
	  "=c" (dummy)

	: "S" (s2),
	  "c" (n2-n1)
	);

	return result;
}


char* sse4_strstr_max36(char* s1, int n1, char* s2, int n2) {
	// 21 <= n1 <= 36 (mask is indexed with n1-5-16), n2 > 4
	uint32_t dummy __attribute__((unused));
	char* result;

	__asm__ volatile ("movdqu (%%eax), %%xmm1" : : "a" (s1));
	__asm__ volatile ("movdqu (%%eax), %%xmm2" : : "a" (s1+4)); // xmm2 - s1[4:20]
	__asm__ volatile ("movdqu (%%eax), %%xmm3" : : "a" (s1+4+16)); // xmm3 - s1[20:] (suffix)
	__asm__ volatile ("movdqu (%%eax), %%xmm6" : : "a" (mask[n1-5-16]));
	__asm__ volatile ("pand %%xmm6, %%xmm3" : : );
	__asm__ volatile ("pxor %%xmm0, %%xmm0" : : ); // packed_byte(0x00)
	__asm__ volatile ("pcmpeqb %%xmm5, %%xmm5" : : ); // packed_byte(0xff)

	__asm__ volatile (
	/*** main loop *********************************************************/
	"0: \n"
		// load 16 bytes, MPSADBW considers just 8+3 chars at the beginning
		" movdqu (%%esi), %%xmm7 \n"
		" addl $8, %%esi \n" // advance pointer: s2 += 8 (esi is bound to s2)

		// xmm7 - vector of L1 distances between s1's 4-byte prefix
		//        and sequence of eight 4-byte subvectors from xmm7
		" mpsadbw $0, %%xmm1, %%xmm7 \n"

		// xmm7 - word becomes 0xffff if L1=0, 0x0000 otherwise
		" pcmpeqw %%xmm0, %%xmm7 \n"

		// any L1=0?
if no, skip comparision inner loop " ptest %%xmm7, %%xmm0 \n" " jc 1f \n" /*** inner loop ************************************************/ // comparision inner loop: convert word mask to bitmask " pmovmskb %%xmm7, %%edx \n" // we are interested in **word** positions " andl $0b0101010101010101, %%edx \n" " 2: \n" " bsf %%edx, %%eax \n" // get next bit position " jz 1f \n" // no bit set? exit loop " \n" " btr %%eax, %%edx \n" // unset bit " shr $1, %%eax \n" // divide position by 2 " movdqu -4(%%esi, %%eax), %%xmm7 \n" " movdqu 12(%%esi, %%eax), %%xmm4 \n" " pand %%xmm6, %%xmm4 \n" " pcmpeqb %%xmm2, %%xmm7 \n" " pcmpeqb %%xmm3, %%xmm4 \n" " pand %%xmm7, %%xmm4 \n" " ptest %%xmm5, %%xmm4 \n" " jnc 2b \n" " leal -8(%%eax, %%esi), %%eax \n" // eax -- address " jmp 4f \n" // of s1's first occurance /*** main loop prologue ************************************************/ "1: \n" " subl $8, %%ecx \n" " cmpl $0, %%ecx \n" " jg 0b \n" " xorl %%eax, %%eax \n" // s1 not found, return NULL "4: \n" : "=a" (result), "=S" (dummy), "=c" (dummy) : "S" (s2), "c" (n2-n1) ); return result; } char* sse4_strstr_len4(char* s1, int n1, char* s2, int n2) { // n1 == 4, n2 > 4 uint32_t dummy __attribute__((unused)); char* result; __asm__ volatile ("movdqu (%%eax), %%xmm1" : : "a" (s1)); __asm__ volatile ("pxor %%xmm0, %%xmm0" : : ); __asm__ volatile ( /*** main loop *********************************************************/ "0: \n" // load 16 bytes, we consider just 8+3 chars at the beggining " movdqu (%%esi), %%xmm2 \n" " addl $8, %%esi \n" // advance pointer: s1 += 8 // xmm2 - vector of L1 distances between s1's 4-byte prefix // and sequence of eight 4-byte subvectors from xmm2 " mpsadbw $0, %%xmm1, %%xmm2 \n" // xmm2 - word become 0xffff if L1=0, 0x0000 otherwise " pcmpeqw %%xmm0, %%xmm2 \n" // any L1=0? 
if no, skip comparision inner loop " ptest %%xmm2, %%xmm0 \n" " jnc 1f \n" " subl $8, %%ecx \n" " cmpl $0, %%ecx \n" " jg 0b \n" " xorl %%eax, %%eax \n" // s1 not found, return NULL " jmp 2f \n" "1: \n" " pmovmskb %%xmm2, %%eax \n" " bsfl %%eax, %%eax \n" " shrl $1, %%eax \n" " lea -8(%%esi, %%eax), %%eax \n" "2: \n" : "=a" (result), "=S" (dummy), "=c" (dummy) : "S" (s2), "c" (n2-n1) ); return result; } char* sse4_strstr_len3(char* s1, int n1, char* s2, int n2) { // n1 == 4, n2 > 4 uint32_t dummy __attribute__((unused)); char* result; __asm__ volatile ("movdqu (%%eax), %%xmm1" : : "a" (s1)); __asm__ volatile ("pxor %%xmm0, %%xmm0" : : ); __asm__ volatile ( /*** main loop *********************************************************/ "0: \n" // load 16 bytes, we consider just 8+3 chars at the beggining " movdqu (%%esi), %%xmm2 \n" " addl $8, %%esi \n" // advance pointer: s1 += 8 " movdqa %%xmm2, %%xmm3 \n" " psrldq $3, %%xmm3 \n" " pmovzxbw %%xmm3, %%xmm3 \n" // xmm2 - vector of L1 distances between s1's 4-byte prefix // and sequence of eight 4-byte subvectors from xmm2 " mpsadbw $0, %%xmm1, %%xmm2 \n" " psubw %%xmm3, %%xmm2 \n" // xmm2 - word become 0xffff if L1=0, 0x0000 otherwise " pcmpeqw %%xmm0, %%xmm2 \n" // any L1=0? 
if no, skip comparision inner loop " ptest %%xmm2, %%xmm0 \n" " jnc 1f \n" " subl $8, %%ecx \n" " cmpl $0, %%ecx \n" " jg 0b \n" " xorl %%eax, %%eax \n" // s1 not found, return NULL " jmp 2f \n" "1: \n" " pmovmskb %%xmm2, %%eax \n" " bsfl %%eax, %%eax \n" " shrl $1, %%eax \n" " lea -8(%%esi, %%eax), %%eax \n" "2: \n" : "=a" (result), "=S" (dummy), "=c" (dummy) : "S" (s2), "c" (n2-n1) ); return result; } // sample uint8_t buffer[1024*500 + 1]; void help() { puts("prog file sse4|libc|verify iter-count string"); puts("* iter-count > 0"); exit(1); } int main(int argc, char* argv[]) { FILE* f; int i; int size; if (argc != 5) help(); f = fopen(argv[1], "r"); if (!f) { printf("can't open '%s'\n", argv[1]); return 2; } size = fread(buffer, 1, sizeof(buffer), f); buffer[size] = 0; fclose(f); int fun = -1, iters, n1; char* s1; if (strcasecmp("sse4", argv[2]) == 0) fun = 0; else if (strcasecmp("libc", argv[2]) == 0) fun = 1; else if (strcasecmp("verify", argv[2]) == 0) fun = 2; else help(); if (atoi(argv[3]) <= 0 && (fun != 2)) help(); else iters = atoi(argv[3]); s1 = argv[4]; n1 = strlen(s1); if ((n1 < 3)) help(); else printf("s1(%d)='%s' s2(%d)\n", n1, s1, size); char* r1; char* r2; switch (fun) { case 0: puts("SSE4"); for (i=0; i < iters; i++) sse4_strstr(s1, n1, (char*)buffer, size); break; case 1: puts("Lib C"); for (i=0; i < iters; i++) { //(unsigned int)strstr((char*)buffer, s1); __asm__ volatile ( "movl $buffer, (%%esp)\n" "movl %0, 4(%%esp)\n" "call strstr\n" : : "r" (s1) : "eax", "ecx", "edx" ); } break; case 2: puts("verify"); r1 = strstr((char*)buffer, s1); r2 = sse4_strstr(s1, n1, (char*)buffer, size); printf("LibC = %u\n", (unsigned int)r1); printf("SSE4 = %u %s\n", (unsigned int)r2, (r1 != r2) ? "FAILED!!!" : "ok" ); if (r1 != r2) return 1; } return 0; } // eof ================================================ FILE: results/armv7-32bit-gcc4.9.2.txt ================================================ ./speedup_arm data/i386.txt data/words 1 std::strstr ... 
reference result = 810807651, time = 7.318775 s std::string::find ... reference result = 810807651, time = 4.171311 s SWAR 32-bit (generic) ... reference result = 810807651, time = 2.450585 s ARM Neon 32 bit (v2) ... reference result = 810807651, time = 1.299383 s ./speedup_arm data/i386.txt data/words 1 std::strstr ... reference result = 810807651, time = 7.329223 s std::string::find ... reference result = 810807651, time = 4.188313 s SWAR 32-bit (generic) ... reference result = 810807651, time = 2.461333 s ARM Neon 32 bit (v2) ... reference result = 810807651, time = 1.305622 s ./speedup_arm data/i386.txt data/words 1 std::strstr ... reference result = 810807651, time = 7.304049 s std::string::find ... reference result = 810807651, time = 4.172608 s SWAR 32-bit (generic) ... reference result = 810807651, time = 2.451913 s ARM Neon 32 bit (v2) ... reference result = 810807651, time = 1.300619 s ./speedup_arm data/i386.txt data/words 1 std::strstr ... reference result = 810807651, time = 7.307621 s std::string::find ... reference result = 810807651, time = 4.176439 s SWAR 32-bit (generic) ... reference result = 810807651, time = 2.451030 s ARM Neon 32 bit (v2) ... reference result = 810807651, time = 1.299274 s ./speedup_arm data/i386.txt data/words 1 std::strstr ... reference result = 810807651, time = 7.313498 s std::string::find ... reference result = 810807651, time = 4.175714 s SWAR 32-bit (generic) ... reference result = 810807651, time = 2.451439 s ARM Neon 32 bit (v2) ... reference result = 810807651, time = 1.298613 s ================================================ FILE: results/armv8-64bit-clang3.8.0.txt ================================================ std::strstr ... reference result = 810807651, time = 3.457578 s std::string::find ... reference result = 810807651, time = 1.821379 s SWAR 64-bit (generic) ... reference result = 810807651, time = 0.463006 s SWAR 32-bit (generic) ... reference result = 810807651, time = 0.810749 s ARM Neon 32 bit (v2) ... 
reference result = 810807651, time = 0.407214 s AArch64 64 bit (v2) ... reference result = 810807651, time = 0.279203 s std::strstr ... reference result = 810807651, time = 3.381364 s std::string::find ... reference result = 810807651, time = 1.813678 s SWAR 64-bit (generic) ... reference result = 810807651, time = 0.462694 s SWAR 32-bit (generic) ... reference result = 810807651, time = 0.810882 s ARM Neon 32 bit (v2) ... reference result = 810807651, time = 0.406888 s AArch64 64 bit (v2) ... reference result = 810807651, time = 0.278970 s std::strstr ... reference result = 810807651, time = 4.118293 s std::string::find ... reference result = 810807651, time = 1.822696 s SWAR 64-bit (generic) ... reference result = 810807651, time = 0.463028 s SWAR 32-bit (generic) ... reference result = 810807651, time = 0.810933 s ARM Neon 32 bit (v2) ... reference result = 810807651, time = 0.407296 s AArch64 64 bit (v2) ... reference result = 810807651, time = 0.279606 s std::strstr ... reference result = 810807651, time = 3.375462 s std::string::find ... reference result = 810807651, time = 1.821449 s SWAR 64-bit (generic) ... reference result = 810807651, time = 0.462863 s SWAR 32-bit (generic) ... reference result = 810807651, time = 0.811320 s ARM Neon 32 bit (v2) ... reference result = 810807651, time = 0.407274 s AArch64 64 bit (v2) ... reference result = 810807651, time = 0.279285 s std::strstr ... reference result = 810807651, time = 3.378566 s std::string::find ... reference result = 810807651, time = 1.825054 s SWAR 64-bit (generic) ... reference result = 810807651, time = 0.462957 s SWAR 32-bit (generic) ... reference result = 810807651, time = 0.811188 s ARM Neon 32 bit (v2) ... reference result = 810807651, time = 0.407364 s AArch64 64 bit (v2) ... 
reference result = 810807651, time = 0.279490 s ================================================ FILE: results/bulldozer-fx-8510-gcc4.8.4-sse.txt ================================================ ./speedup data/i386.txt data/words std::strstr ... reference result = 8108076510, time = 9.390892 s SWAR 64-bit (generic) ... reference result = 8108076510, time = 2.938355 s SSE2 (generic) ... reference result = 8108076510, time = 0.788781 s SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 1.989833 s SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 2.060081 s SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 2.006810 s ./speedup data/i386.txt data/words std::strstr ... reference result = 8108076510, time = 9.387153 s SWAR 64-bit (generic) ... reference result = 8108076510, time = 2.948608 s SSE2 (generic) ... reference result = 8108076510, time = 0.789325 s SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 1.988635 s SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 2.066327 s SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 2.007233 s ./speedup data/i386.txt data/words std::strstr ... reference result = 8108076510, time = 9.377923 s SWAR 64-bit (generic) ... reference result = 8108076510, time = 2.967027 s SSE2 (generic) ... reference result = 8108076510, time = 0.788709 s SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 1.989077 s SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 2.065608 s SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 2.007228 s ================================================ FILE: results/cascadelake-Gold-5217-gcc-7.4.0-avx512bw.txt ================================================ ./speedup_avx512bw data/i386.txt data/words scalar (naive) ... reference result = 8108076510, time = 4.095307 s std::strstr ... reference result = 8108076510, time = 0.492459 s SWAR 64-bit (generic) ... 
reference result = 8108076510, time = 1.243510 s SWAR 32-bit (generic) ... reference result = 8108076510, time = 2.349437 s SSE2 (generic) ... reference result = 8108076510, time = 0.443313 s SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.583372 s SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.822263 s SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.311350 s SSE (naive) ... reference result = 8108076510, time = 1.757493 s AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.531920 s AVX2 (generic) ... reference result = 8108076510, time = 0.338738 s AVX2 (naive) ... reference result = 8108076510, time = 1.013489 s AVX2-wide (naive) ... reference result = 8107771150, time = 0.480182 s AVX512F (MPSADBW-like) ... reference result = 8108076510, time = 0.634909 s AVX512F (generic) ... reference result = 8108076510, time = 0.281276 s AVX512BW (generic) ... reference result = 8108076510, time = 0.256798 s ./speedup_avx512bw data/i386.txt data/words scalar (naive) ... reference result = 8108076510, time = 4.089051 s std::strstr ... reference result = 8108076510, time = 0.492275 s SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.243637 s SWAR 32-bit (generic) ... reference result = 8108076510, time = 2.343078 s SSE2 (generic) ... reference result = 8108076510, time = 0.443659 s SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.584467 s SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.822993 s SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.313485 s SSE (naive) ... reference result = 8108076510, time = 1.760697 s AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.531827 s AVX2 (generic) ... reference result = 8108076510, time = 0.338912 s AVX2 (naive) ... reference result = 8108076510, time = 1.012637 s AVX2-wide (naive) ... reference result = 8107771150, time = 0.478455 s AVX512F (MPSADBW-like) ... 
reference result = 8108076510, time = 0.636537 s AVX512F (generic) ... reference result = 8108076510, time = 0.279054 s AVX512BW (generic) ... reference result = 8108076510, time = 0.255777 s ./speedup_avx512bw data/i386.txt data/words scalar (naive) ... reference result = 8108076510, time = 4.092489 s std::strstr ... reference result = 8108076510, time = 0.489993 s SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.241418 s SWAR 32-bit (generic) ... reference result = 8108076510, time = 2.346954 s SSE2 (generic) ... reference result = 8108076510, time = 0.442109 s SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.583955 s SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.822657 s SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.312243 s SSE (naive) ... reference result = 8108076510, time = 1.757719 s AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.532528 s AVX2 (generic) ... reference result = 8108076510, time = 0.338666 s AVX2 (naive) ... reference result = 8108076510, time = 1.013151 s AVX2-wide (naive) ... reference result = 8107771150, time = 0.477202 s AVX512F (MPSADBW-like) ... reference result = 8108076510, time = 0.634753 s AVX512F (generic) ... reference result = 8108076510, time = 0.280525 s AVX512BW (generic) ... reference result = 8108076510, time = 0.256838 s ================================================ FILE: results/haswell-i7-4770-gcc5.4.1-avx2.txt ================================================ ./speedup_avx2 data/i386.txt data/words std::strstr ... reference result = 8108076510, time = 0.528137 s SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.605520 s SSE2 (generic) ... reference result = 8108076510, time = 0.554532 s SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.897859 s SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.996473 s SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.559956 s AVX2 (MPSADBW) ... 
reference result = 8108076510, time = 0.615836 s AVX2 (generic) ... reference result = 8108076510, time = 0.386747 s ./speedup_avx2 data/i386.txt data/words std::strstr ... reference result = 8108076510, time = 0.527864 s SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.577149 s SSE2 (generic) ... reference result = 8108076510, time = 0.554352 s SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.897752 s SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.996771 s SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.560012 s AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.615825 s AVX2 (generic) ... reference result = 8108076510, time = 0.386528 s ./speedup_avx2 data/i386.txt data/words std::strstr ... reference result = 8108076510, time = 0.528205 s SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.591732 s SSE2 (generic) ... reference result = 8108076510, time = 0.554423 s SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.897921 s SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.996889 s SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.559919 s AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.615783 s AVX2 (generic) ... reference result = 8108076510, time = 0.386609 s ================================================ FILE: results/knights-landing-7210-gcc5.3.0-avx512f.txt ================================================ ./speedup_avx512 data/i386.txt data/words std::strstr ... reference result = 8108076510, time = 4.964439 s SWAR 64-bit (generic) ... reference result = 8108076510, time = 8.205818 s SSE2 (generic) ... reference result = 8108076510, time = 6.126381 s SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 18.737857 s SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 13.745691 s SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 6.306659 s AVX2 (MPSADBW) ... 
reference result = 8108076510, time = 13.179747 s AVX2 (generic) ... reference result = 8108076510, time = 4.113571 s AVX512F (MPSADBW-like) ... reference result = 8108076510, time = 2.348848 s AVX512F (generic) ... reference result = 8108076510, time = 1.164081 s ./speedup_avx512 data/i386.txt data/words std::strstr ... reference result = 8108076510, time = 4.946063 s SWAR 64-bit (generic) ... reference result = 8108076510, time = 8.172884 s SSE2 (generic) ... reference result = 8108076510, time = 6.107860 s SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 18.717146 s SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 13.724856 s SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 6.288685 s AVX2 (MPSADBW) ... reference result = 8108076510, time = 13.151361 s AVX2 (generic) ... reference result = 8108076510, time = 4.094781 s AVX512F (MPSADBW-like) ... reference result = 8108076510, time = 2.327864 s AVX512F (generic) ... reference result = 8108076510, time = 1.142747 s ./speedup_avx512 data/i386.txt data/words std::strstr ... reference result = 8108076510, time = 4.949234 s SWAR 64-bit (generic) ... reference result = 8108076510, time = 8.170751 s SSE2 (generic) ... reference result = 8108076510, time = 6.109035 s SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 18.716665 s SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 13.727568 s SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 6.289994 s AVX2 (MPSADBW) ... reference result = 8108076510, time = 13.153943 s AVX2 (generic) ... reference result = 8108076510, time = 4.094941 s AVX512F (MPSADBW-like) ... reference result = 8108076510, time = 2.326156 s AVX512F (generic) ... 
reference result = 8108076510, time = 1.140567 s ================================================ FILE: results/postprocess.py ================================================ from collections import OrderedDict def load(file): D = OrderedDict() for line in file: if 'reference result' not in line: continue name, tail = line.split('...') name = name.strip() time = float(tail.split()[6]) if name not in D: D[name] = time else: D[name] = min(time, D[name]) return D def main(): import sys paths = sys.argv[1:] for path in paths: if len(paths) > 1: print path with open(path, 'rt') as f: for name, time in load(f).iteritems(): print '%-30s %10.5f' % (name, time) if __name__ == '__main__': main() ================================================ FILE: results/skylake-i7-6700-gcc5.4.1-avx2.txt ================================================ ./speedup_avx2 data/i386.txt data/words std::strstr ... reference result = 8108076510, time = 0.662049 s SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.404260 s SSE2 (generic) ... reference result = 8108076510, time = 0.489281 s SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.638782 s SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.879433 s SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.390802 s AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.570455 s AVX2 (generic) ... reference result = 8108076510, time = 0.363694 s ./speedup_avx2 data/i386.txt data/words std::strstr ... reference result = 8108076510, time = 0.662266 s SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.404036 s SSE2 (generic) ... reference result = 8108076510, time = 0.489313 s SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.638926 s SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.879193 s SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.390626 s AVX2 (MPSADBW) ... 
reference result = 8108076510, time = 0.569980 s AVX2 (generic) ... reference result = 8108076510, time = 0.363876 s ./speedup_avx2 data/i386.txt data/words std::strstr ... reference result = 8108076510, time = 0.661478 s SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.405280 s SSE2 (generic) ... reference result = 8108076510, time = 0.488631 s SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.638753 s SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.879345 s SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.390670 s AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.569808 s AVX2 (generic) ... reference result = 8108076510, time = 0.363091 s ================================================ FILE: results/skylake-i9-7900-gcc-5.4.1-avx512bw.txt ================================================ ./speedup_avx512bw data/i386.txt data/words naive scalar ... reference result = 8108076510, time = 4.872957 s std::strstr ... reference result = 8108076510, time = 0.401080 s SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.237922 s SWAR 32-bit (generic) ... reference result = 8108076510, time = 2.044511 s SSE2 (generic) ... reference result = 8108076510, time = 0.385573 s SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.580510 s SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.674341 s SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.058753 s SSE (naive) ... reference result = 8108076510, time = 1.709206 s AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.444774 s AVX2 (generic) ... reference result = 8108076510, time = 0.274761 s AVX2 (naive) ... reference result = 8108076510, time = 0.918683 s AVX2 (naive unrolled) ... reference result = 8108076510, time = 0.463246 s AVX2-wide (naive) ... reference result = 8107771150, time = 0.441233 s AVX512F (MPSADBW-like) ... reference result = 8108076510, time = 0.507046 s AVX512F (generic) ... 
reference result = 8108076510, time = 0.262774 s AVX512BW (generic) ... reference result = 8108076510, time = 0.220457 s ./speedup_avx512bw data/i386.txt data/words naive scalar ... reference result = 8108076510, time = 4.816247 s std::strstr ... reference result = 8108076510, time = 0.398468 s SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.239442 s SWAR 32-bit (generic) ... reference result = 8108076510, time = 2.050195 s SSE2 (generic) ... reference result = 8108076510, time = 0.384561 s SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.582862 s SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.675480 s SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.061598 s SSE (naive) ... reference result = 8108076510, time = 1.676643 s AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.439711 s AVX2 (generic) ... reference result = 8108076510, time = 1.638515 s AVX2 (naive) ... reference result = 8108076510, time = 0.984768 s AVX2 (naive unrolled) ... reference result = 8108076510, time = 0.494318 s AVX2-wide (naive) ... reference result = 8107771150, time = 0.479306 s AVX512F (MPSADBW-like) ... reference result = 8108076510, time = 0.553042 s AVX512F (generic) ... reference result = 8108076510, time = 0.290909 s AVX512BW (generic) ... reference result = 8108076510, time = 0.237055 s ./speedup_avx512bw data/i386.txt data/words naive scalar ... reference result = 8108076510, time = 6.406914 s std::strstr ... reference result = 8108076510, time = 0.401352 s SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.237499 s SWAR 32-bit (generic) ... reference result = 8108076510, time = 2.043457 s SSE2 (generic) ... reference result = 8108076510, time = 0.385167 s SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.581361 s SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.675044 s SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.059933 s SSE (naive) ... 
reference result = 8108076510, time = 1.671910 s AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.444940 s AVX2 (generic) ... reference result = 8108076510, time = 0.276522 s AVX2 (naive) ... reference result = 8108076510, time = 0.921444 s AVX2 (naive unrolled) ... reference result = 8108076510, time = 0.464818 s AVX2-wide (naive) ... reference result = 8107771150, time = 0.442211 s AVX512F (MPSADBW-like) ... reference result = 8108076510, time = 0.511326 s AVX512F (generic) ... reference result = 8108076510, time = 0.265488 s AVX512BW (generic) ... reference result = 8108076510, time = 0.221329 s ================================================ FILE: results/westmere-m540-gcc6.2.0-sse4.txt ================================================ ./speedup data/i386.txt data/words std::strstr ... reference result = 8108076510, time = 0.832291 s SWAR 64-bit (generic) ... reference result = 8108076510, time = 2.498591 s SSE2 (generic) ... reference result = 8108076510, time = 0.745890 s SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 1.450405 s SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 1.238676 s SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.699681 s ./speedup data/i386.txt data/words std::strstr ... reference result = 8108076510, time = 0.822457 s SWAR 64-bit (generic) ... reference result = 8108076510, time = 2.518604 s SSE2 (generic) ... reference result = 8108076510, time = 0.750936 s SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 1.470000 s SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 1.239929 s SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.707411 s ./speedup data/i386.txt data/words std::strstr ... reference result = 8108076510, time = 0.827280 s SWAR 64-bit (generic) ... reference result = 8108076510, time = 2.535406 s SSE2 (generic) ... reference result = 8108076510, time = 0.747252 s SSE4.1 (MPSADBW) ... 
reference result = 8108076510, time = 1.456153 s SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 1.238485 s SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.711734 s ================================================ FILE: scalar.cpp ================================================ // Implementation by Daniel Lemire // https://github.com/WojciechMula/sse4-strstr/issues/2 size_t strstr_naive(const char * hay, size_t size, const char *needle, size_t needlesize) { if (size == needlesize) { return memcmp(hay, needle, size) == 0 ? 0 : std::string::npos; } const char first = needle[0]; const ssize_t maxpos = ssize_t(size) - ssize_t(needlesize) + 1; for(ssize_t i = 0; i < maxpos; i++) { if(hay[i] != first) { i++; while( i < maxpos && hay[i] != first ) i++; if ( i == maxpos ) break; } size_t j = 1; for( ; j < needlesize; ++j) if(hay[ i + j ] != needle[ j ] ) break; if( j == needlesize) return i; } return std::string::npos; } ================================================ FILE: src/all.h ================================================ #pragma once #include "common.h" #include #include #include "fixed-memcmp.cpp" #include "scalar.cpp" #include "swar64-strstr-v2.cpp" #include "swar32-strstr-v2.cpp" #ifdef HAVE_SSE_INSTRUCTIONS # include # include "sse4-strstr.cpp" # include "sse4-strstr-unrolled.cpp" # include "sse4.2-strstr.cpp" # include "sse2-strstr.cpp" # include "sse-naive-strstr.cpp" # include "sse2-needle4.cpp" #endif #ifdef HAVE_AVX2_INSTRUCTIONS # include # include "avx2-strstr.cpp" # include "avx2-strstr-v2.cpp" # include "avx2-naive-strstr.cpp" # include "avx2-naive-strstr64.cpp" # include "avx2-naive-unrolled-strstr.cpp" #endif #ifdef HAVE_AVX512F_INSTRUCTIONS # include "avx512f-strstr.cpp" # include "avx512f-strstr-v2.cpp" #endif #ifdef HAVE_AVX512BW_INSTRUCTIONS # include "avx512bw-strstr-v2.cpp" # include "avx512bw-strstr-v3.cpp" #endif #ifdef HAVE_NEON_INSTRUCTIONS # include # include "neon-strstr-v2.cpp" #endif #ifdef 
HAVE_AARCH64_ARCHITECTURE # include "aarch64-strstr-v2.cpp" #endif ================================================ FILE: src/all_procedures.cpp ================================================ #include "all.h" #include #include #include #include using str_find_fun = size_t (*)(const char*, size_t, const char*, size_t); struct Procedures { struct Item { str_find_fun proc; std::string name; char code; bool builtin; Item(str_find_fun proc_, const char* name_, char code_, bool builtin_ = false) : proc(proc_) , name(name_) , code(code_) , builtin(builtin_) {} }; std::vector procedures; const Item& operator[](char code) { auto pred = [code](const Item& item){return item.code == code;}; auto it = std::find_if(procedures.begin(), procedures.end(), pred); if (it == procedures.end()) { throw std::logic_error("can't find procedure with code '" + std::string(1, code) + "'"); } return *it; } }; size_t strstr_libc(const char* s, size_t, const char* needle, size_t) { const char* ptr = strstr(s, needle); if (ptr) { return ptr - s; } else { return std::string::npos; } } Procedures all_procedures() { Procedures db; db.procedures.emplace_back( strstr_naive, "scalar (naive)", 'a' ); db.procedures.emplace_back( strstr_libc, "std::strstr", 'b', true ); db.procedures.emplace_back( nullptr, "std::string::find", 'c', true ); #define REGISTER(code, name, procedure) \ { \ str_find_fun f = procedure; \ db.procedures.emplace_back(f, name, code); \ } REGISTER('d', "SWAR 64-bit (generic)", swar64_strstr_v2); REGISTER('e', "SWAR 32-bit (generic)", swar32_strstr_v2); #ifdef HAVE_SSE_INSTRUCTIONS REGISTER('f', "SSE2 (generic)", sse2_strstr_v2); REGISTER('g', "SSE4.1 (MPSADBW)", sse4_strstr); REGISTER('h', "SSE4.1 (MPSADBW unrolled)", sse4_strstr_unrolled); REGISTER('i', "SSE4.2 (PCMPESTRM)", sse42_strstr); REGISTER('j', "SSE (naive)", sse_naive_strstr); REGISTER('v', "SSE2 (4-byte needle)", sse2_strstr_needle4); REGISTER('w', "SSE2 (4-byte needle v2)", sse2_strstr_needle4_v2); #endif #ifdef 
HAVE_AVX2_INSTRUCTIONS REGISTER('k', "AVX2 (MPSADBW)", avx2_strstr); REGISTER('l', "AVX2 (generic)", avx2_strstr_v2); REGISTER('m', "AVX2 (naive)", avx2_naive_strstr); REGISTER('n', "AVX2 (naive unrolled)", avx2_naive_unrolled_strstr); REGISTER('o', "AVX2-wide (naive)", avx2_naive_strstr64); #endif #ifdef HAVE_AVX512F_INSTRUCTIONS REGISTER('p', "AVX512F (MPSADBW-like)", avx512f_strstr); REGISTER('q', "AVX512F (generic)", avx512f_strstr_v2); #endif #ifdef HAVE_AVX512BW_INSTRUCTIONS REGISTER('r', "AVX512BW (generic)", avx512bw_strstr_v2); REGISTER('s', "AVX512BW (masked)", avx512bw_strstr_v3); #endif #ifdef HAVE_NEON_INSTRUCTIONS REGISTER('t', "ARM Neon 32 bit (v2)", neon_strstr_v2); #endif #ifdef HAVE_AARCH64_ARCHITECTURE REGISTER('u', "AArch64 64 bit (v2)", aarch64_strstr_v2); #endif #undef REGISTER return db; } ================================================ FILE: src/application_base.cpp ================================================ class ApplicationBase { protected: std::string file; std::vector words; public: class Error final { public: const std::string message; public: Error(const std::string& msg) : message(msg) {} }; public: void prepare(const std::string& file_name, const std::string& words_name) { load_text(file_name); load_words(words_name); } private: void load_text(const std::string& path) { FILE* f = fopen(path.c_str(), "rt"); if (f == nullptr) { throw_errno(path); } fseek(f, -1, SEEK_END); const auto size = ftell(f); rewind(f); char* buffer = new char[size]; fread(buffer, size, 1, f); buffer[size] = 0; fclose(f); file = buffer; delete[] buffer; } void load_words(const std::string& path) { char buffer[1024]; FILE* f = fopen(path.c_str(), "rt"); if (f == nullptr) { throw_errno(path); } while (!feof(f)) { fgets(buffer, sizeof(buffer), f); const auto len = strlen(buffer); if (buffer[len - 1] == '\n') { buffer[len - 1] = 0; if (len == 1) // skip empty strings continue; } words.push_back(buffer); } fclose(f); } void throw_errno(const std::string& 
prefix) { const std::string msg = prefix + ": " + std::string(strerror(errno)); throw Error(msg); } }; ================================================ FILE: src/benchmark.cpp ================================================ #include #include #include #include #include #include #include "all_procedures.cpp" // ------------------------------------------------------------------------ #include #include "benchmark.h" #include "application_base.cpp" class Application final: public ApplicationBase { Procedures db; public: enum class TestType { OptimisticCase, Random, WorstCase }; struct Parameters { size_t needle_position; size_t needle_size; size_t count; TestType test_type; std::string procedure_codes; }; public: Application(const Parameters& params) : db(all_procedures()) , parameters(params) { prepare(); } bool operator()() { // strstr is treated as built-in function by GCC // it seems it's wiped out in benchmark const bool measure_stdstring = false; #if defined(HAVE_NEON_INSTRUCTIONS) && !defined(HAVE_AARCH64_ARCHITECTURE) // On Raspberry Pi it's terribly slow, but on Aarch64 // the 64-bit procedure is pretty fast const bool measure_swar64 = false; #else const bool measure_swar64 = true; #endif if (is_enabled('a')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return strstr_naive(s.data(), s.size(), neddle.data(), neddle.size()); }; measure(find, 'a'); } if (is_enabled('b')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { const char* res = strstr(s.data(), neddle.data()); if (res != nullptr) { return res - s.data(); } else { return std::string::npos; } }; measure(find, 'b'); } if (measure_stdstring && is_enabled('c')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return s.find(neddle); }; measure(find, 'c'); } if (measure_swar64 && is_enabled('d')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return swar64_strstr_v2(s, neddle); }; 
measure(find, 'd'); } if (is_enabled('e')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return swar32_strstr_v2(s, neddle); }; measure(find, 'e'); } #ifdef HAVE_SSE_INSTRUCTIONS if (is_enabled('f')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return sse2_strstr_v2(s, neddle); }; measure(find, 'f'); } if (is_enabled('g')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return sse4_strstr(s, neddle); }; measure(find, 'g'); } if (is_enabled('h')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return sse4_strstr_unrolled(s, neddle); }; measure(find, 'h'); } if (is_enabled('i')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return sse42_strstr(s, neddle); }; measure(find, 'i'); } if (is_enabled('j')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return sse_naive_strstr(s, neddle); }; measure(find, 'j'); } if (is_enabled('v')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return sse2_strstr_needle4(s, neddle); }; measure(find, 'v'); } if (is_enabled('w')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return sse2_strstr_needle4_v2(s, neddle); }; measure(find, 'w'); } #endif #ifdef HAVE_AVX2_INSTRUCTIONS if (is_enabled('k')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return avx2_strstr(s, neddle); }; measure(find, 'k'); } if (is_enabled('l')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return avx2_strstr_v2(s, neddle); }; measure(find, 'l'); } if (is_enabled('m')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return avx2_naive_strstr(s, neddle); }; measure(find, 'm'); } if (is_enabled('n')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return avx2_naive_unrolled_strstr(s, neddle); }; 
measure(find, 'n'); } if (is_enabled('o')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return avx2_naive_strstr64(s, neddle); }; measure(find, 'o'); } #endif #ifdef HAVE_AVX512F_INSTRUCTIONS if (is_enabled('p')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return avx512f_strstr(s, neddle); }; measure(find, 'p'); } if (is_enabled('q')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return avx512f_strstr_v2(s, neddle); }; measure(find, 'q'); } #endif #ifdef HAVE_AVX512BW_INSTRUCTIONS if (is_enabled('r')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return avx512bw_strstr_v2(s, neddle); }; measure(find, 'r'); } if (is_enabled('u')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return avx512bw_strstr_v3(s, neddle); }; measure(find, 'u'); } #endif #ifdef HAVE_NEON_INSTRUCTIONS if (is_enabled('s')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return neon_strstr_v2(s, neddle); }; measure(find, 's'); } #endif #ifdef HAVE_AARCH64_ARCHITECTURE if (is_enabled('t')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return aarch64_strstr_v2(s, neddle); }; measure(find, 't'); } #endif return true; } static void print_help(const char* progname) { std::printf("%s needle-position needle-size iteration-count test-name [procedures]\n", progname); std::puts(""); std::puts("Parameters:"); std::puts(""); std::puts(" needle-position position of the needle"); std::puts(" needle-size length of the needle"); std::puts(" count how many times test is repeated"); std::puts(" test-name one of 'optimistic', 'random', 'worst'"); std::puts(" procedures procedure code(s), listed below [by default all will be tested]"); std::puts(""); std::puts("Test kinds"); std::puts(""); std::puts(" optimistic data before needle contains characters don't present in the needle"); std::puts(" 
random data before needle contains some random characters"); std::puts(" worst needle has form 'aaa...aaaXaaa...aaa', and data before is filled with the 'a'"); std::puts(""); std::puts("Following procedures are available:"); for (auto& item: all_procedures().procedures) { printf(" [%c] %s\n", item.code, item.name.c_str()); } } private: volatile size_t sink; template void measure(T_FIND find, char code) { BEST_TIME(/**/, sink = find(input, needle), db[code].name.c_str(), parameters.count, parameters.needle_position); } bool is_enabled(char proc) const { return (parameters.procedure_codes.empty()) || (parameters.procedure_codes.find(proc) != std::string::npos); } void prepare_needle() { needle.append(parameters.needle_size/2, 'a'); needle.append(1, 'X'); needle.append(parameters.needle_size - needle.size(), 'a'); } void prepare_input() { const size_t padding = 256; switch (parameters.test_type) { case TestType::OptimisticCase: input.assign(parameters.needle_position, '_'); break; case TestType::WorstCase: input.assign(parameters.needle_position, 'a'); break; case TestType::Random: for (size_t i=0; i < parameters.needle_position; i++) { const char c = rand() % ('z' - 'a' + 1) + 'a'; input.push_back(c); } break; } input += needle; input.append(padding, '_'); // to make sure that memory after the needle is accessible } void prepare() { prepare_needle(); prepare_input(); } std::string needle; std::string input; Parameters parameters; }; bool parse(int argc, char* argv[], Application::Parameters& p) { if (argc < 5) { return false; } for (int i=1; i < argc; i++) { const std::string tmp = argv[i]; if (tmp == "-h" || tmp == "--help") return false; } p.needle_position = atoi(argv[1]); p.needle_size = atoi(argv[2]); p.count = atoi(argv[3]); if (p.needle_size < 3) { throw std::runtime_error("needle size must be greater than 2"); } if (p.count == 0) { throw std::runtime_error("count must be greater than 0"); } std::string tmp(argv[4]); if (tmp == "optimistic") { p.test_type = 
Application::TestType::OptimisticCase; } else if (tmp == "worst") { p.test_type = Application::TestType::WorstCase; } else if (tmp == "random") { p.test_type = Application::TestType::Random; } else { throw std::runtime_error("expected 'optimistic', 'worst' or 'random', got '" + tmp + "'"); } if (argc >= 6) { p.procedure_codes = argv[5]; } return true; } int main(int argc, char* argv[]) { try { Application::Parameters params; if (!parse(argc, argv, params)) { Application::print_help(argv[0]); return EXIT_FAILURE; } Application app(params); return app() ? EXIT_SUCCESS : EXIT_FAILURE; } catch (std::runtime_error& err) { const auto msg = ansi::seq("Error", ansi::RED); printf("%s: %s\n", msg.data(), err.what()); return EXIT_FAILURE; } catch (ApplicationBase::Error& err) { const auto msg = ansi::seq("Error", ansi::RED); printf("%s: %s\n", msg.data(), err.message.data()); return EXIT_FAILURE; } } ================================================ FILE: src/benchmark.h ================================================ #ifndef _BENCHMARK_H_ #define _BENCHMARK_H_ #include #define RDTSC_START(cycles) \ do { \ uint32_t cyc_high, cyc_low; \ __asm volatile("cpuid\n" \ "rdtsc\n" \ "mov %%edx, %0\n" \ "mov %%eax, %1" : \ "=r" (cyc_high), \ "=r"(cyc_low) : \ : /* no read only */ \ "%rax", "%rbx", "%rcx", "%rdx" /* clobbers */ \ ); \ (cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \ } while (0) #define RDTSC_STOP(cycles) \ do { \ uint32_t cyc_high, cyc_low; \ __asm volatile("rdtscp\n" \ "mov %%edx, %0\n" \ "mov %%eax, %1\n" \ "cpuid" : \ "=r"(cyc_high), \ "=r"(cyc_low) : \ /* no read only registers */ : \ "%rax", "%rbx", "%rcx", "%rdx" /* clobbers */ \ ); \ (cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \ } while (0) static __attribute__ ((noinline)) uint64_t rdtsc_overhead_func(uint64_t dummy) { return dummy; } uint64_t global_rdtsc_overhead = (uint64_t) UINT64_MAX; #define RDTSC_SET_OVERHEAD(test, repeat) \ do { \ uint64_t cycles_start, cycles_final, cycles_diff; \ uint64_t 
min_diff = UINT64_MAX; \ for (unsigned i = 0; i < repeat; i++) { \ __asm volatile("" ::: /* pretend to clobber */ "memory"); \ RDTSC_START(cycles_start); \ test; \ RDTSC_STOP(cycles_final); \ cycles_diff = (cycles_final - cycles_start); \ if (cycles_diff < min_diff) min_diff = cycles_diff; \ } \ global_rdtsc_overhead = min_diff; \ printf("rdtsc_overhead set to %d\n", (int)global_rdtsc_overhead); \ } while (0) \ /* * Prints the best number of operations per cycle where * test is the function call, answer is the expected answer generated by * test, repeat is the number of times we should repeat and size is the * number of operations represented by test. */ #define BEST_TIME(pre, test, test_name, repeat, size) \ do { \ if (global_rdtsc_overhead == UINT64_MAX) { \ RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), repeat); \ } \ printf("%-30s\t: ", test_name); fflush(stdout); \ uint64_t cycles_start, cycles_final, cycles_diff; \ uint64_t min_diff = (uint64_t)-1; \ uint64_t sum_diff = 0; \ for (size_t i = 0; i < repeat; i++) { \ pre; \ __asm volatile("" ::: /* pretend to clobber */ "memory"); \ RDTSC_START(cycles_start); \ test; \ RDTSC_STOP(cycles_final); \ cycles_diff = (cycles_final - cycles_start - global_rdtsc_overhead); \ if (cycles_diff < min_diff) min_diff = cycles_diff; \ sum_diff += cycles_diff; \ } \ uint64_t S = size; \ float cycle_per_op = (min_diff) / (double)S; \ float avg_cycle_per_op = (sum_diff) / ((double)S * repeat); \ printf(" %8.3f cycle/op (best) %8.3f cycle/op (avg)\n", cycle_per_op, avg_cycle_per_op); \ } while (0) #endif ================================================ FILE: src/speedup.cpp ================================================ #include #include #include #include #include #include #include #include "all_procedures.cpp" // ------------------------------------------------------------------------ #include #include "application_base.cpp" class Application final: public ApplicationBase { Procedures db; std::size_t count; const std::string 
procedure_codes; public: struct Parameters { std::string file_name; std::string words_name; size_t count = 10; std::string procedure_codes; }; public: Application(const Parameters& params) : db(all_procedures()) , count(params.count) , procedure_codes(params.procedure_codes) { prepare(params.file_name, params.words_name); } bool operator()() { #if defined(__GNUC__) && !defined(HAVE_NEON_INSTRUCTIONS) // GNU std::string::find was proven to be utterly slow, // don't waste our time on reconfirming that fact. // // (On Raspberry Pi it's fast, though) const bool measure_stdstring = false; #else const bool measure_stdstring = true; #endif #if defined(HAVE_NEON_INSTRUCTIONS) && !defined(HAVE_AARCH64_ARCHITECTURE) // On Raspberry Pi it's terribly slow, but on Aarch64 // the 64-bit procedure is pretty fast const bool measure_swar64 = false; #else const bool measure_swar64 = true; #endif if (is_enabled('a')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return strstr_naive(s.data(), s.size(), neddle.data(), neddle.size()); }; measure(find, 'a'); } if (is_enabled('b')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { const char* res = strstr(s.data(), neddle.data()); if (res != nullptr) { return res - s.data(); } else { return std::string::npos; } }; measure(find, 'b'); } if (measure_stdstring && is_enabled('c')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return s.find(neddle); }; measure(find, 'c'); } if (measure_swar64 && is_enabled('d')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return swar64_strstr_v2(s, neddle); }; measure(find, 'd'); } if (is_enabled('e')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return swar32_strstr_v2(s, neddle); }; measure(find, 'e'); } #ifdef HAVE_SSE_INSTRUCTIONS if (is_enabled('f')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return 
sse2_strstr_v2(s, neddle); }; measure(find, 'f'); } if (is_enabled('g')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return sse4_strstr(s, neddle); }; measure(find, 'g'); } if (is_enabled('h')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return sse4_strstr_unrolled(s, neddle); }; measure(find, 'h'); } if (is_enabled('i')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return sse42_strstr(s, neddle); }; measure(find, 'i'); } if (is_enabled('j')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return sse_naive_strstr(s, neddle); }; measure(find, 'j'); } #endif #ifdef HAVE_AVX2_INSTRUCTIONS if (is_enabled('k')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return avx2_strstr(s, neddle); }; measure(find, 'k'); } if (is_enabled('l')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return avx2_strstr_v2(s, neddle); }; measure(find, 'l'); } if (is_enabled('m')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return avx2_naive_strstr(s, neddle); }; measure(find, 'm'); } if (is_enabled('n')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return avx2_naive_unrolled_strstr(s, neddle); }; measure(find, 'n'); } if (is_enabled('o')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return avx2_naive_strstr64(s, neddle); }; measure(find, 'o'); } #endif #ifdef HAVE_AVX512F_INSTRUCTIONS if (is_enabled('p')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return avx512f_strstr(s, neddle); }; measure(find, 'p'); } if (is_enabled('q')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return avx512f_strstr_v2(s, neddle); }; measure(find, 'q'); } #endif #ifdef HAVE_AVX512BW_INSTRUCTIONS if (is_enabled('r')) { auto find = [](const std::string& s, const 
std::string& neddle) -> size_t { return avx512bw_strstr_v2(s, neddle); }; measure(find, 'r'); } #endif #ifdef HAVE_NEON_INSTRUCTIONS if (is_enabled('s')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return neon_strstr_v2(s, neddle); }; measure(find, 's'); } #endif #ifdef HAVE_AARCH64_ARCHITECTURE if (is_enabled('t')) { auto find = [](const std::string& s, const std::string& neddle) -> size_t { return aarch64_strstr_v2(s, neddle); }; measure(find, 't'); } #endif return true; } static void print_help(const char* progname) { std::printf("%s file words [count] [procedure]\n", progname); std::puts(""); std::puts("Parameters:"); std::puts(""); std::puts(" file - arbitrary file"); std::puts(" words - list of words in separate lines"); std::puts(" count - repeat count (optional, default = 10)"); std::puts(" procedure - letter(s) from square brackets (by default all functions are checked)"); std::puts(""); std::puts("Following procedures ara available:"); for (auto& item: all_procedures().procedures) { printf(" [%c] %s\n", item.code, item.name.c_str()); } } private: template void measure(T_FIND find, char code) { printf("%-40s... 
", db[code].name.c_str()); fflush(stdout); size_t result = 0; const auto t1 = std::chrono::high_resolution_clock::now(); auto k = count; while (k != 0) { for (const auto& word: words) { result += find(file, word); } k--; } const auto t2 = std::chrono::high_resolution_clock::now(); const std::chrono::duration td = t2-t1; printf("reference result = %lu, time = %10.6f s\n", result, td.count()); } bool is_enabled(char proc) const { return (procedure_codes.empty()) || (procedure_codes.find(proc) != std::string::npos); } }; bool parse(int argc, char* argv[], Application::Parameters& p) { if (argc < 3) { return false; } for (int i=1; i < argc; i++) { const std::string tmp = argv[i]; if (tmp == "-h" || tmp == "--help") return false; } p.file_name = argv[1]; p.words_name = argv[2]; if (argc >= 4) { size_t tmp = atoi(argv[3]); if (tmp > 0) { p.count = tmp; } else { printf("repeat count '%s' invalid, keeping default %lu\n", argv[3], p.count); } } if (argc >= 5) { p.procedure_codes = argv[4]; } return true; } int main(int argc, char* argv[]) { try { Application::Parameters params; if (!parse(argc, argv, params)) { Application::print_help(argv[0]); return EXIT_FAILURE; } Application app(params); return app() ? EXIT_SUCCESS : EXIT_FAILURE; } catch (ApplicationBase::Error& err) { const auto msg = ansi::seq("Error: ", ansi::RED); printf("%s: %s\n", msg.data(), err.message.data()); return EXIT_FAILURE; } } ================================================ FILE: src/unittests.cpp ================================================ #include #include #include #include #include #include #include "all.h" #include #include "all_procedures.cpp" bool test(const char* name, str_find_fun strstr_function) { std::printf("%s... ", name); std::fflush(stdout); for (size_t size = 1; size < 64; size++) { const std::string neddle = "$" + std::string(size, 'x') + "#"; for (size_t n = 0; n < 3*16; n++) { const std::string prefix(n, '.'); for (size_t k = 0; k < 3*16; k++) { // '.' 
* k + '$' + 'x' * size + '#' + '.' * k const std::string suffix(k, '.'); const std::string str = prefix + neddle + suffix; const auto result = strstr_function(str.data(), str.size(), neddle.data(), neddle.size()); if (result != n) { printf("%s\n", ansi::seq("FAILED", ansi::RED).c_str()); printf(" string = '%s' (length %lu)\n", str.data(), str.size()); printf(" neddle = '%s' (length %lu)\n", neddle.data(), neddle.size()); printf(" expected result = %lu, actual result = %lu\n", n, result); return false; } } } } const auto msg = ansi::seq("OK", ansi::GREEN); printf("%s\n", msg.c_str()); return true; } int main() { int ret = EXIT_SUCCESS; puts("running unit tests"); auto db = all_procedures(); for (auto& item: db.procedures) { if (item.builtin) { continue; } if (!test(item.name.c_str(), item.proc)) { ret = EXIT_FAILURE; } } return ret; } ================================================ FILE: src/validate.cpp ================================================ #include #include #include #include #include #include // ------------------------------------------------------------------------ #include "all_procedures.cpp" // ------------------------------------------------------------------------ #include #include "application_base.cpp" class Application final: public ApplicationBase { public: Application(const std::string& file_name, const std::string& words_name) { prepare(file_name, words_name); } bool run() { const auto n = words.size(); auto db = all_procedures(); for (size_t i = 0; i < n; i++) { if (i % 100 == 0) { print_progress(i, n); } const auto& word = words[i]; const size_t reference = file.find(word); for (auto& item: db.procedures) { if (item.builtin) { continue; } const size_t result = item.proc(file.data(), file.size(), word.data(), word.size()); if (reference != result) { putchar('\n'); const auto msg = ansi::seq("ERROR", ansi::RED); printf("%s: std::find result = %lu, %s = %lu\n", msg.data(), reference, item.name.c_str(), result); printf("word: '%s' (length 
%lu)\n", word.data(), word.size()); return false; } } } print_progress(n, n); putchar('\n'); const auto msg = ansi::seq("OK", ansi::GREEN); printf("%s\n", msg.c_str()); return true; } static void print_help(const char* progname) { std::printf("usage: %s [file] [words]\n", progname); std::puts(""); std::puts("Search all words in a file using std::string::find and SSE4 procedure"); std::puts(""); std::puts("Parameters:"); std::puts(""); std::puts(" file - arbitrary file"); std::puts(" words - list of words in separate lines"); } private: void print_progress(size_t pos, size_t n) { printf("validating... %0.2f%% (%lu/%lu)\r", 100.0*pos/n, pos, n); fflush(stdout); } }; int main(int argc, char* argv[]) { if (argc == 3) { try { Application app(argv[1], argv[2]); const auto ret = app.run(); return ret ? EXIT_SUCCESS : EXIT_FAILURE; } catch (ApplicationBase::Error& err) { const auto msg = ansi::seq("Error: ", ansi::RED); printf("%s: %s\n", msg.data(), err.message.data()); return EXIT_FAILURE; } } else { Application::print_help(argv[0]); return EXIT_FAILURE; } } ================================================ FILE: sse-naive-strstr.cpp ================================================ // Method descibed in https://arxiv.org/pdf/1612.01506.pdf // // Implementation by Daniel Lemire // https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/blob/master/extra/simd/substring/substring.c size_t FORCE_INLINE sse_naive_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) { assert(k > 0); assert(n > 0); if (n == k) { return (memcmp(s, needle, k) == 0) ? 
0 : std::string::npos; } for (size_t i = 0; i < n - k + 1; i += 16) { uint16_t found = 0xffff; for (size_t j = 0; (j < k) && (found != 0) ; ++j) { const __m128i textvector = _mm_loadu_si128((const __m128i *)(s + i + j)); const __m128i needlevector = _mm_set1_epi8(needle[j]); uint16_t bitmask = _mm_movemask_epi8(_mm_cmpeq_epi8(textvector, needlevector)); found = found & bitmask; } if (found != 0) { return i + __builtin_ctz(found); } } return std::string::npos; } // ------------------------------------------------------------------------ size_t sse_naive_strstr(const char* s, size_t n, const char* needle, size_t k) { size_t result = std::string::npos; if (n < k) { return result; } result = sse_naive_strstr_anysize(s, n, needle, k); if (result <= n - k) { return result; } else { return std::string::npos; } } // ------------------------------------------------------------------------ size_t sse_naive_strstr(const std::string& s, const std::string& needle) { return sse_naive_strstr(s.data(), s.size(), needle.data(), needle.size()); } ================================================ FILE: sse2-needle4.cpp ================================================ size_t FORCE_INLINE sse2_needle4(const char* s, size_t n, const char* needle, size_t k) { uint32_t u32; memcpy(&u32, needle, sizeof(u32)); const __m128i v_needle = _mm_set1_epi32(u32); const __m128i shuffle = _mm_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); for (size_t i = 0; i < n - k + 1; i += 4) { // 1. load 7 bytes: // [abcd|efg?|????|????] uint64_t u64; memcpy(&u64, &s[i], sizeof(u64)); const __m128i t0 = _mm_cvtsi64x_si128(u64); // 2. make all possible 4-byte substrings // [abcd|bcde|cdef|defg] const __m128i t1 = _mm_shuffle_epi8(shuffle, t0); // 3. 
compare the 4-byte substrings with the needle const __m128i t2 = _mm_cmpeq_epi32(v_needle, t1); const int mask = _mm_movemask_ps((__m128)t2); if (mask != 0) { return i + __builtin_clz(mask); } } return std::string::npos; } // ------------------------------------------------------------------------ size_t sse2_strstr_needle4(const char* s, size_t n, const char* needle, size_t k) { if (k != 4) { return std::string::npos; } return sse2_needle4(s, n, needle, k); } // ------------------------------------------------------------------------ size_t sse2_strstr_needle4(const std::string& s, const std::string& needle) { return sse2_strstr_needle4(s.data(), s.size(), needle.data(), needle.size()); } size_t FORCE_INLINE sse2_needle4_v2(const char* s, size_t n, const char* needle, size_t k) { uint32_t u32; memcpy(&u32, needle, sizeof(u32)); const __m128i v_needle = _mm_set1_epi32(u32); const __m128i shuffle0 = _mm_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); const __m128i shuffle1 = _mm_setr_epi8(4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10); for (size_t i = 0; i < n - k + 1; i += 8) { // 1. load 15 ytes: // [abcd|efgh|ijkl|????] const __m128i input = _mm_loadu_si128((const __m128i*)(s + i)); // 2a. make all possible 4-byte substrings // lo = [abcd|bcde|cdef|defg] const __m128i lo = _mm_shuffle_epi8(shuffle0, input); // hi = [efgh|fghi|ghij|hijk] const __m128i hi = _mm_shuffle_epi8(shuffle1, input); // 3. 
compare the 4-byte substrings with the needle const __m128i eq_lo = _mm_cmpeq_epi32(v_needle, lo); const __m128i eq_hi = _mm_cmpeq_epi32(v_needle, hi); // to perform single movemask in the main loop const __m128i t0 = _mm_or_si128(eq_lo, eq_hi); const int mask = _mm_movemask_ps((__m128)t0); if (mask != 0) { const int mask_lo = _mm_movemask_ps((__m128)eq_lo); if (mask_lo != 0) { return i + __builtin_clz(mask_lo); } else { return i + 4 + __builtin_clz(mask); } } } return std::string::npos; } // ------------------------------------------------------------------------ size_t sse2_strstr_needle4_v2(const char* s, size_t n, const char* needle, size_t k) { if (k != 4) { return std::string::npos; } return sse2_needle4_v2(s, n, needle, k); } // ------------------------------------------------------------------------ size_t sse2_strstr_needle4_v2(const std::string& s, const std::string& needle) { return sse2_strstr_needle4_v2(s.data(), s.size(), needle.data(), needle.size()); } ================================================ FILE: sse2-strstr.cpp ================================================ // implements scheme described in http://0x80.pl/articles/simd-friendly-karp-rabin.html size_t FORCE_INLINE sse2_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) { assert(k > 0); assert(n > 0); const __m128i first = _mm_set1_epi8(needle[0]); const __m128i last = _mm_set1_epi8(needle[k - 1]); for (size_t i = 0; i < n; i += 16) { const __m128i block_first = _mm_loadu_si128(reinterpret_cast(s + i)); const __m128i block_last = _mm_loadu_si128(reinterpret_cast(s + i + k - 1)); const __m128i eq_first = _mm_cmpeq_epi8(first, block_first); const __m128i eq_last = _mm_cmpeq_epi8(last, block_last); uint16_t mask = _mm_movemask_epi8(_mm_and_si128(eq_first, eq_last)); while (mask != 0) { const auto bitpos = bits::get_first_bit_set(mask); if (memcmp(s + i + bitpos + 1, needle + 1, k - 2) == 0) { return i + bitpos; } mask = bits::clear_leftmost_set(mask); } } return 
std::string::npos; } // ------------------------------------------------------------------------ template size_t FORCE_INLINE sse2_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) { assert(k > 0); assert(n > 0); const __m128i first = _mm_set1_epi8(needle[0]); const __m128i last = _mm_set1_epi8(needle[k - 1]); for (size_t i = 0; i < n; i += 16) { const __m128i block_first = _mm_loadu_si128(reinterpret_cast(s + i)); const __m128i block_last = _mm_loadu_si128(reinterpret_cast(s + i + k - 1)); const __m128i eq_first = _mm_cmpeq_epi8(first, block_first); const __m128i eq_last = _mm_cmpeq_epi8(last, block_last); uint32_t mask = _mm_movemask_epi8(_mm_and_si128(eq_first, eq_last)); while (mask != 0) { const auto bitpos = bits::get_first_bit_set(mask); if (memcmp_fun(s + i + bitpos + 1, needle + 1)) { return i + bitpos; } mask = bits::clear_leftmost_set(mask); } } return std::string::npos; } // ------------------------------------------------------------------------ size_t sse2_strstr_v2(const char* s, size_t n, const char* needle, size_t k) { size_t result = std::string::npos; if (n < k) { return result; } switch (k) { case 0: return 0; case 1: { const char* res = reinterpret_cast(strchr(s, needle[0])); return (res != nullptr) ? 
res - s : std::string::npos; } case 2: result = sse2_strstr_memcmp<2>(s, n, needle, always_true); break; case 3: result = sse2_strstr_memcmp<3>(s, n, needle, memcmp1); break; case 4: result = sse2_strstr_memcmp<4>(s, n, needle, memcmp2); break; case 5: result = sse2_strstr_memcmp<5>(s, n, needle, memcmp4); break; case 6: result = sse2_strstr_memcmp<6>(s, n, needle, memcmp4); break; case 7: result = sse2_strstr_memcmp<7>(s, n, needle, memcmp5); break; case 8: result = sse2_strstr_memcmp<8>(s, n, needle, memcmp6); break; case 9: result = sse2_strstr_memcmp<9>(s, n, needle, memcmp8); break; case 10: result = sse2_strstr_memcmp<10>(s, n, needle, memcmp8); break; case 11: result = sse2_strstr_memcmp<11>(s, n, needle, memcmp9); break; case 12: result = sse2_strstr_memcmp<12>(s, n, needle, memcmp10); break; default: result = sse2_strstr_anysize(s, n, needle, k); break; } if (result <= n - k) { return result; } else { return std::string::npos; } } // ------------------------------------------------------------------------ size_t sse2_strstr_v2(const std::string& s, const std::string& needle) { return sse2_strstr_v2(s.data(), s.size(), needle.data(), needle.size()); } ================================================ FILE: sse4-strstr-unrolled.cpp ================================================ // Note: it appears that these specialized functions do not help. // But I decided to left them, just in case. 
// use functions/templates dealing with certain substring length //#define ENABLE_SSE4_LENGTH_SPECIALIZATIONS // When defined use sse4_strstr_unrolled_memcmp template, // otherwise use just sse4_strstr_unrolled_max20 and sse4_strstr_unrolled_max36 //#define ENABLE_SSE4_MEMCMP_TEMPLATES size_t sse4_strstr_unrolled_anysize(const char* s, size_t n, const char* needle, size_t needle_size) { assert(needle_size > 4); assert(n > 0); const __m128i prefix = _mm_loadu_si128(reinterpret_cast(needle)); const __m128i zeros = _mm_setzero_si128(); __m128i prev = _mm_loadu_si128(reinterpret_cast(s)); __m128i curr; for (size_t i = 0; i < n; i += 16) { curr = _mm_loadu_si128(reinterpret_cast(s + i + 16)); const __m128i data0 = prev; const __m128i data1 = _mm_alignr_epi8(curr, prev, 8); const __m128i result0 = _mm_mpsadbw_epu8(data0, prefix, 0); const __m128i result1 = _mm_mpsadbw_epu8(data1, prefix, 0); prev = curr; const __m128i result = _mm_packus_epi16(result0, result1); const __m128i cmp = _mm_cmpeq_epi8(result, zeros); unsigned mask = _mm_movemask_epi8(cmp); while (mask != 0) { const auto bitpos = bits::get_first_bit_set(mask); if (memcmp(s + i + bitpos + 4, needle + 4, needle_size - 4) == 0) { return i + bitpos; } mask = bits::clear_leftmost_set(mask); } } return std::string::npos; } // ------------------------------------------------------------------------ template size_t sse4_strstr_unrolled_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) { assert(k > 4); assert(n > 0); const __m128i prefix = _mm_loadu_si128(reinterpret_cast(needle)); const __m128i zeros = _mm_setzero_si128(); __m128i prev = _mm_loadu_si128(reinterpret_cast(s)); __m128i curr; for (size_t i = 0; i < n; i += 16) { curr = _mm_loadu_si128(reinterpret_cast(s + i + 16)); const __m128i data0 = prev; const __m128i data1 = _mm_alignr_epi8(curr, prev, 8); const __m128i result0 = _mm_mpsadbw_epu8(data0, prefix, 0); const __m128i result1 = _mm_mpsadbw_epu8(data1, prefix, 0); prev = curr; const 
__m128i result = _mm_packus_epi16(result0, result1); const __m128i cmp = _mm_cmpeq_epi8(result, zeros); unsigned mask = _mm_movemask_epi8(cmp); while (mask != 0) { const auto bitpos = bits::get_first_bit_set(mask); if (memcmp_fun(s + i + bitpos + 4, needle + 4)) { return i + bitpos; } mask = bits::clear_leftmost_set(mask); } } return std::string::npos; } // ------------------------------------------------------------------------ size_t sse4_strstr_unrolled_max20(const char* s, size_t n, const char* needle, size_t needle_size) { const __m128i zeros = _mm_setzero_si128(); const __m128i prefix = sse::load(needle); const __m128i suffix = sse::load(needle + 4); const __m128i suff_mask = sse::mask_lower_bytes(needle_size - 4); for (size_t i = 0; i < n; i += 8) { const __m128i data = sse::load(s + i); const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0); const __m128i cmp = _mm_cmpeq_epi16(result, zeros); unsigned mask = _mm_movemask_epi8(cmp) & 0x5555; while (mask != 0) { const auto bitpos = bits::get_first_bit_set(mask)/2; const __m128i str = sse::load(s + i + bitpos + 4); const __m128i cmp = _mm_cmpeq_epi8(str, suffix); if (_mm_testc_si128(cmp, suff_mask)) { return i + bitpos; } mask = bits::clear_leftmost_set(mask); } } return std::string::npos; } // ------------------------------------------------------------------------ size_t sse4_strstr_unrolled_max36(const char* s, size_t n, const char* needle, size_t needle_size) { const __m128i zeros = _mm_setzero_si128(); const __m128i prefix = sse::load(needle); const __m128i suffix1 = sse::load(needle + 4); const __m128i suffix2 = sse::load(needle + 16 + 4); const __m128i suff_mask = sse::mask_higher_bytes(needle_size - (16 + 4)); for (size_t i = 0; i < n; i += 8) { const __m128i data = sse::load(s + i); const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0); const __m128i cmp = _mm_cmpeq_epi16(result, zeros); unsigned mask = _mm_movemask_epi8(cmp) & 0x5555; while (mask != 0) { const auto bitpos = 
bits::get_first_bit_set(mask)/2; const __m128i c1 = _mm_cmpeq_epi8(sse::load(s + i + bitpos + 4), suffix1); const __m128i c2 = _mm_cmpeq_epi8(sse::load(s + i + bitpos + 16 + 4), suffix2); const __m128i c3 = _mm_or_si128(c2, suff_mask); const __m128i tmp = _mm_and_si128(c1, c3); if (_mm_movemask_epi8(tmp) == 0xffff) { return i + bitpos; } mask = bits::clear_leftmost_set(mask); } } return std::string::npos; } // ------------------------------------------------------------------------ size_t sse4_strstr_unrolled_len3(const char* s, size_t n, const char* needle) { const __m128i prefix = _mm_loadu_si128(reinterpret_cast(needle)); const __m128i zeros = _mm_setzero_si128(); for (size_t i = 0; i < n; i += 8) { const __m128i data = _mm_loadu_si128(reinterpret_cast(s + i)); const __m128i lastbyte = _mm_cvtepu8_epi16(_mm_srli_si128(data, 3)); const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0); const __m128i cmp = _mm_cmpeq_epi16(_mm_sub_epi16(result, lastbyte), zeros); unsigned mask = _mm_movemask_epi8(cmp) & 0x5555; if (mask != 0) { return i + bits::get_first_bit_set(mask)/2; } } return std::string::npos; } // ------------------------------------------------------------------------ size_t sse4_strstr_unrolled_len4(const char* s, size_t n, const char* needle) { const __m128i prefix = _mm_loadu_si128(reinterpret_cast(needle)); const __m128i zeros = _mm_setzero_si128(); for (size_t i = 0; i < n; i += 8) { const __m128i data = _mm_loadu_si128(reinterpret_cast(s + i)); const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0); const __m128i cmp = _mm_cmpeq_epi16(result, zeros); unsigned mask = _mm_movemask_epi8(cmp); if (mask != 0) { return i + bits::get_first_bit_set(mask)/2; } } return std::string::npos; } // ------------------------------------------------------------------------ size_t sse4_strstr_unrolled(const char* s, size_t n, const char* needle, size_t needle_size) { size_t result = std::string::npos; if (n < needle_size) { return result; } switch (needle_size) { 
case 0: return 0; case 1: { const char* res = reinterpret_cast(strchr(s, needle[0])); return (res != nullptr) ? res - s : std::string::npos; } case 2: { const char* res = reinterpret_cast(strstr(s, needle)); return (res != nullptr) ? res - s : std::string::npos; } case 3: result = sse4_strstr_unrolled_len3(s, n, needle); break; case 4: result = sse4_strstr_unrolled_len4(s, n, needle); break; #ifdef ENABLE_SSE4_LENGTH_SPECIALIZATIONS #ifdef ENABLE_SSE4_MEMCMP_TEMPLATES case 5: result = sse4_strstr_unrolled_memcmp<5>(s, n, needle, memcmp1); break; case 6: result = sse4_strstr_unrolled_memcmp<6>(s, n, needle, memcmp2); break; case 7: result = sse4_strstr_unrolled_memcmp<7>(s, n, needle, memcmp3); break; case 8: result = sse4_strstr_unrolled_memcmp<8>(s, n, needle, memcmp4); break; case 9: result = sse4_strstr_unrolled_memcmp<9>(s, n, needle, memcmp5); break; case 10: result = sse4_strstr_unrolled_memcmp<10>(s, n, needle, memcmp6); break; case 11: result = sse4_strstr_unrolled_memcmp<11>(s, n, needle, memcmp7); break; case 12: result = sse4_strstr_unrolled_memcmp<12>(s, n, needle, memcmp8); break; case 13: result = sse4_strstr_unrolled_memcmp<13>(s, n, needle, memcmp9); break; case 14: result = sse4_strstr_unrolled_memcmp<14>(s, n, needle, memcmp10); break; #else case 5: case 6: case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: /* 5 .. 
14 */ #endif // ENABLE_SSE4_MEMCMP_TEMPLATES case 15: case 16: case 17: case 18: case 19: case 20: /* 15..20 */ result = sse4_strstr_unrolled_max20(s, n, needle, needle_size); break; case 21: case 22: case 23: case 24: case 25: case 26: case 27: case 28: case 29: case 30: case 31: case 32: case 33: case 34: case 35: case 36: /* 21..36 */ result = sse4_strstr_unrolled_max36(s, n, needle, needle_size); break; #endif // ENABLE_SSE4_LENGTH_SPECIALIZATIONS default: result = sse4_strstr_unrolled_anysize(s, n, needle, needle_size); break; } if (result <= n - needle_size) { return result; } else { return std::string::npos; } } // -------------------------------------------------- size_t sse4_strstr_unrolled(const std::string& s, const std::string& needle) { return sse4_strstr_unrolled(s.data(), s.size(), needle.data(), needle.size()); } ================================================ FILE: sse4-strstr.cpp ================================================ size_t sse4_strstr_anysize(const char* s, size_t n, const char* needle, size_t needle_size) { assert(needle_size > 4); assert(n > 0); const __m128i prefix = _mm_loadu_si128(reinterpret_cast(needle)); const __m128i zeros = _mm_setzero_si128(); for (size_t i = 0; i < n; i += 8) { const __m128i data = _mm_loadu_si128(reinterpret_cast(s + i)); const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0); const __m128i cmp = _mm_cmpeq_epi16(result, zeros); unsigned mask = _mm_movemask_epi8(cmp) & 0x5555; while (mask != 0) { const auto bitpos = bits::get_first_bit_set(mask)/2; if (memcmp(s + i + bitpos + 4, needle + 4, needle_size - 4) == 0) { return i + bitpos; } mask = bits::clear_leftmost_set(mask); } } return std::string::npos; } // ------------------------------------------------------------------------ template size_t sse4_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) { assert(k > 4); assert(n > 0); const __m128i prefix = _mm_loadu_si128(reinterpret_cast(needle)); const __m128i zeros = 
_mm_setzero_si128(); for (size_t i = 0; i < n; i += 8) { const __m128i data = _mm_loadu_si128(reinterpret_cast(s + i)); const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0); const __m128i cmp = _mm_cmpeq_epi16(result, zeros); unsigned mask = _mm_movemask_epi8(cmp) & 0x5555; while (mask != 0) { const auto bitpos = bits::get_first_bit_set(mask)/2; if (memcmp_fun(s + i + bitpos + 4, needle + 4)) { return i + bitpos; } mask = bits::clear_leftmost_set(mask); } } return std::string::npos; } // ------------------------------------------------------------------------ size_t sse4_strstr_max20(const char* s, size_t n, const char* needle, size_t needle_size) { const __m128i zeros = _mm_setzero_si128(); const __m128i prefix = sse::load(needle); const __m128i suffix = sse::load(needle + 4); const __m128i suff_mask = sse::mask_lower_bytes(needle_size - 4); for (size_t i = 0; i < n; i += 8) { const __m128i data = sse::load(s + i); const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0); const __m128i cmp = _mm_cmpeq_epi16(result, zeros); unsigned mask = _mm_movemask_epi8(cmp) & 0x5555; while (mask != 0) { const auto bitpos = bits::get_first_bit_set(mask)/2; const __m128i str = sse::load(s + i + bitpos + 4); const __m128i cmp = _mm_cmpeq_epi8(str, suffix); if (_mm_testc_si128(cmp, suff_mask)) { return i + bitpos; } mask = bits::clear_leftmost_set(mask); } } return std::string::npos; } // ------------------------------------------------------------------------ size_t sse4_strstr_max36(const char* s, size_t n, const char* needle, size_t needle_size) { const __m128i zeros = _mm_setzero_si128(); const __m128i prefix = sse::load(needle); const __m128i suffix1 = sse::load(needle + 4); const __m128i suffix2 = sse::load(needle + 16 + 4); const __m128i suff_mask = sse::mask_higher_bytes(needle_size - (16 + 4)); for (size_t i = 0; i < n; i += 8) { const __m128i data = sse::load(s + i); const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0); const __m128i cmp = _mm_cmpeq_epi16(result, 
zeros); unsigned mask = _mm_movemask_epi8(cmp) & 0x5555; while (mask != 0) { const auto bitpos = bits::get_first_bit_set(mask)/2; const __m128i c1 = _mm_cmpeq_epi8(sse::load(s + i + bitpos + 4), suffix1); const __m128i c2 = _mm_cmpeq_epi8(sse::load(s + i + bitpos + 16 + 4), suffix2); const __m128i c3 = _mm_or_si128(c2, suff_mask); const __m128i tmp = _mm_and_si128(c1, c3); if (_mm_movemask_epi8(tmp) == 0xffff) { return i + bitpos; } mask = bits::clear_leftmost_set(mask); } } return std::string::npos; } // ------------------------------------------------------------------------ size_t sse4_strstr_len3(const char* s, size_t n, const char* needle) { const __m128i prefix = _mm_loadu_si128(reinterpret_cast(needle)); const __m128i zeros = _mm_setzero_si128(); for (size_t i = 0; i < n; i += 8) { const __m128i data = _mm_loadu_si128(reinterpret_cast(s + i)); const __m128i lastbyte = _mm_cvtepu8_epi16(_mm_srli_si128(data, 3)); const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0); const __m128i cmp = _mm_cmpeq_epi16(_mm_sub_epi16(result, lastbyte), zeros); unsigned mask = _mm_movemask_epi8(cmp) & 0x5555; if (mask != 0) { return i + bits::get_first_bit_set(mask)/2; } } return std::string::npos; } // ------------------------------------------------------------------------ size_t sse4_strstr_len4(const char* s, size_t n, const char* needle) { const __m128i prefix = _mm_loadu_si128(reinterpret_cast(needle)); const __m128i zeros = _mm_setzero_si128(); for (size_t i = 0; i < n; i += 8) { const __m128i data = _mm_loadu_si128(reinterpret_cast(s + i)); const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0); const __m128i cmp = _mm_cmpeq_epi16(result, zeros); unsigned mask = _mm_movemask_epi8(cmp); if (mask != 0) { return i + bits::get_first_bit_set(mask)/2; } } return std::string::npos; } // ------------------------------------------------------------------------ size_t sse4_strstr(const char* s, size_t n, const char* needle, size_t needle_size) { size_t result = 
std::string::npos; if (n < needle_size) { return result; } switch (needle_size) { case 0: return 0; case 1: { const char* res = reinterpret_cast(strchr(s, needle[0])); return (res != nullptr) ? res - s : std::string::npos; } case 2: { const char* res = reinterpret_cast(strstr(s, needle)); return (res != nullptr) ? res - s : std::string::npos; } case 3: result = sse4_strstr_len3(s, n, needle); break; case 4: result = sse4_strstr_len4(s, n, needle); break; #if 1 case 5: result = sse4_strstr_memcmp<5>(s, n, needle, memcmp1); break; case 6: result = sse4_strstr_memcmp<6>(s, n, needle, memcmp2); break; case 7: result = sse4_strstr_memcmp<7>(s, n, needle, memcmp3); break; case 8: result = sse4_strstr_memcmp<8>(s, n, needle, memcmp4); break; case 9: result = sse4_strstr_memcmp<9>(s, n, needle, memcmp5); break; case 10: result = sse4_strstr_memcmp<10>(s, n, needle, memcmp6); break; case 11: result = sse4_strstr_memcmp<11>(s, n, needle, memcmp7); break; case 12: result = sse4_strstr_memcmp<12>(s, n, needle, memcmp8); break; case 13: result = sse4_strstr_memcmp<13>(s, n, needle, memcmp9); break; case 14: result = sse4_strstr_memcmp<14>(s, n, needle, memcmp10); break; #else case 5: case 6: case 7: case 8: case 9: case 10: case 11: case 12: case 13: case 14: /* 5 .. 
14 */ #endif case 15: case 16: case 17: case 18: case 19: case 20: /* 15..20 */ result = sse4_strstr_max20(s, n, needle, needle_size); break; case 21: case 22: case 23: case 24: case 25: case 26: case 27: case 28: case 29: case 30: case 31: case 32: case 33: case 34: case 35: case 36: /* 21..36 */ result = sse4_strstr_max36(s, n, needle, needle_size); break; default: result = sse4_strstr_anysize(s, n, needle, needle_size); break; } if (result <= n - needle_size) { return result; } else { return std::string::npos; } } // -------------------------------------------------- size_t sse4_strstr(const std::string& s, const std::string& needle) { return sse4_strstr(s.data(), s.size(), needle.data(), needle.size()); } ================================================ FILE: sse4.2-strstr.cpp ================================================ /* Usage of PCMPESTRM instruction from SSE 4.1 */ size_t FORCE_INLINE sse42_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) { assert(k > 0); assert(n > 0); const __m128i N = _mm_loadu_si128((__m128i*)needle); for (size_t i = 0; i < n; i += 16) { const int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ORDERED | _SIDD_BIT_MASK; const __m128i D = _mm_loadu_si128((__m128i*)(s + i)); const __m128i res = _mm_cmpestrm(N, k, D, n - i, mode); uint64_t mask = _mm_cvtsi128_si64(res); while (mask != 0) { const auto bitpos = bits::get_first_bit_set(mask); // we know that at least the first character of needle matches if (memcmp(s + i + bitpos + 1, needle + 1, k - 1) == 0) { return i + bitpos; } mask = bits::clear_leftmost_set(mask); } } return std::string::npos; } template size_t FORCE_INLINE sse42_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) { assert(k > 0); assert(n > 0); const __m128i N = _mm_loadu_si128((__m128i*)needle); for (size_t i = 0; i < n; i += 16) { const int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ORDERED | _SIDD_BIT_MASK; const __m128i D = _mm_loadu_si128((__m128i*)(s + i)); const __m128i 
res = _mm_cmpestrm(N, k, D, n - i, mode); uint64_t mask = _mm_cvtsi128_si64(res); while (mask != 0) { const auto bitpos = bits::get_first_bit_set(mask); if (memcmp_fun(s + i + bitpos + 1, needle + 1)) { return i + bitpos; } mask = bits::clear_leftmost_set(mask); } } return std::string::npos; } // ------------------------------------------------------------------------ size_t sse42_strstr(const char* s, size_t n, const char* needle, size_t k) { size_t result = std::string::npos; if (n < k) { return result; } switch (k) { case 0: return 0; case 1: { const char* res = reinterpret_cast(strchr(s, needle[0])); return (res != nullptr) ? res - s : std::string::npos; } case 2: result = sse42_strstr_memcmp<2>(s, n, needle, memcmp1); break; case 3: result = sse42_strstr_memcmp<3>(s, n, needle, memcmp2); break; case 4: result = sse42_strstr_memcmp<4>(s, n, needle, memcmp3); break; case 5: result = sse42_strstr_memcmp<5>(s, n, needle, memcmp4); break; case 6: result = sse42_strstr_memcmp<6>(s, n, needle, memcmp5); break; case 7: result = sse42_strstr_memcmp<7>(s, n, needle, memcmp6); break; case 8: result = sse42_strstr_memcmp<8>(s, n, needle, memcmp7); break; case 9: result = sse42_strstr_memcmp<9>(s, n, needle, memcmp8); break; case 10: result = sse42_strstr_memcmp<10>(s, n, needle, memcmp9); break; case 11: result = sse42_strstr_memcmp<11>(s, n, needle, memcmp10); break; case 12: result = sse42_strstr_memcmp<12>(s, n, needle, memcmp11); break; default: result = sse42_strstr_anysize(s, n, needle, k); break; } if (result <= n - k) { return result; } else { return std::string::npos; } } // ------------------------------------------------------------------------ size_t sse42_strstr(const std::string& s, const std::string& needle) { return sse42_strstr(s.data(), s.size(), needle.data(), needle.size()); } ================================================ FILE: swar32-strstr-v2.cpp ================================================ size_t FORCE_INLINE swar32_strstr_anysize(const char* 
s, size_t n, const char* needle, size_t k) { assert(k > 0); assert(n > 0); const uint32_t first = 0x01010101u * static_cast(needle[0]); const uint32_t last = 0x01010101u * static_cast(needle[k - 1]); uint32_t* block_first = reinterpret_cast(const_cast(s)); uint32_t* block_last = reinterpret_cast(const_cast(s + k - 1)); // 2. sequence scan for (auto i=0u; i < n; i+=4, block_first++, block_last++) { // 0 bytes in eq indicate matching chars const uint32_t eq = (*block_first ^ first) | (*block_last ^ last); // 7th bit set if lower 7 bits are zero const uint32_t t0 = (~eq & 0x7f7f7f7fu) + 0x01010101u; // 7th bit set if 7th bit is zero const uint32_t t1 = (~eq & 0x80808080u); uint32_t zeros = t0 & t1; size_t j = 0; while (zeros) { if (zeros & 0x80) { const char* substr = reinterpret_cast(block_first) + j + 1; if (memcmp(substr, needle + 1, k - 2) == 0) { return i + j; } } zeros >>= 8; j += 1; } } return std::string::npos; } template size_t FORCE_INLINE swar32_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) { assert(n > 0); const uint32_t first = 0x01010101u * static_cast(needle[0]); const uint32_t last = 0x01010101u * static_cast(needle[k - 1]); uint32_t* block_first = reinterpret_cast(const_cast(s)); uint32_t* block_last = reinterpret_cast(const_cast(s + k - 1)); // 2. 
sequence scan for (auto i=0u; i < n; i+=4, block_first++, block_last++) { const uint32_t eq = (*block_first ^ first) | (*block_last ^ last); const uint32_t t0 = (~eq & 0x7f7f7f7fu) + 0x01010101u; const uint32_t t1 = (~eq & 0x80808080u); uint32_t zeros = t0 & t1; size_t j = 0; while (zeros) { if (zeros & 0x80) { const char* substr = reinterpret_cast(block_first) + j + 1; if (memcmp_fun(substr, needle + 1)) { return i + j; } } zeros >>= 8; j += 1; } } return std::string::npos; } // ------------------------------------------------------------------------ size_t swar32_strstr_v2(const char* s, size_t n, const char* needle, size_t k) { size_t result = std::string::npos; if (n < k) { return result; } switch (k) { case 0: return 0; case 1: { const char* res = reinterpret_cast(strchr(s, needle[0])); return (res != nullptr) ? res - s : std::string::npos; } case 2: result = swar32_strstr_memcmp<2>(s, n, needle, always_true); break; case 3: result = swar32_strstr_memcmp<3>(s, n, needle, memcmp1); break; case 4: result = swar32_strstr_memcmp<4>(s, n, needle, memcmp2); break; case 5: // Note: use memcmp4 rather memcmp3, as the last character // of needle is already proven to be equal result = swar32_strstr_memcmp<5>(s, n, needle, memcmp4); break; case 6: result = swar32_strstr_memcmp<6>(s, n, needle, memcmp4); break; case 7: result = swar32_strstr_memcmp<7>(s, n, needle, memcmp5); break; case 8: result = swar32_strstr_memcmp<8>(s, n, needle, memcmp6); break; case 9: // Note: use memcmp8 rather memcmp7 for the same reason as above. 
result = swar32_strstr_memcmp<9>(s, n, needle, memcmp8); break; case 10: result = swar32_strstr_memcmp<10>(s, n, needle, memcmp8); break; case 11: result = swar32_strstr_memcmp<11>(s, n, needle, memcmp9); break; case 12: result = swar32_strstr_memcmp<12>(s, n, needle, memcmp10); break; default: result = swar32_strstr_anysize(s, n, needle, k); break; } if (result <= n - k) { return result; } else { return std::string::npos; } } size_t swar32_strstr_v2(const std::string& s, const std::string& needle) { return swar32_strstr_v2(s.data(), s.size(), needle.data(), needle.size()); } ================================================ FILE: swar64-strstr-v2.cpp ================================================ size_t FORCE_INLINE swar64_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) { assert(k > 0); assert(n > 0); const uint64_t first = 0x0101010101010101llu * static_cast(needle[0]); const uint64_t last = 0x0101010101010101llu * static_cast(needle[k - 1]); uint64_t* block_first = reinterpret_cast(const_cast(s)); uint64_t* block_last = reinterpret_cast(const_cast(s + k - 1)); // 2. 
sequence scan for (auto i=0u; i < n; i+=8, block_first++, block_last++) { // 0 bytes in eq indicate matching chars const uint64_t eq = (*block_first ^ first) | (*block_last ^ last); // 7th bit set if lower 7 bits are zero const uint64_t t0 = (~eq & 0x7f7f7f7f7f7f7f7fllu) + 0x0101010101010101llu; // 7th bit set if 7th bit is zero const uint64_t t1 = (~eq & 0x8080808080808080llu); uint64_t zeros = t0 & t1; size_t j = 0; while (zeros) { if (zeros & 0x80) { const char* substr = reinterpret_cast(block_first) + j + 1; if (memcmp(substr, needle + 1, k - 2) == 0) { return i + j; } } zeros >>= 8; j += 1; } } return std::string::npos; } template size_t FORCE_INLINE swar64_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) { assert(n > 0); const uint64_t first = 0x0101010101010101llu * static_cast(needle[0]); const uint64_t last = 0x0101010101010101llu * static_cast(needle[k - 1]); uint64_t* block_first = reinterpret_cast(const_cast(s)); uint64_t* block_last = reinterpret_cast(const_cast(s + k - 1)); // 2. sequence scan for (auto i=0u; i < n; i+=8, block_first++, block_last++) { const uint64_t eq = (*block_first ^ first) | (*block_last ^ last); const uint64_t t0 = (~eq & 0x7f7f7f7f7f7f7f7fllu) + 0x0101010101010101llu; const uint64_t t1 = (~eq & 0x8080808080808080llu); uint64_t zeros = t0 & t1; size_t j = 0; while (zeros) { if (zeros & 0x80) { const char* substr = reinterpret_cast(block_first) + j + 1; if (memcmp_fun(substr, needle + 1)) { return i + j; } } zeros >>= 8; j += 1; } } return std::string::npos; } // ------------------------------------------------------------------------ size_t swar64_strstr_v2(const char* s, size_t n, const char* needle, size_t k) { size_t result = std::string::npos; if (n < k) { return result; } switch (k) { case 0: return 0; case 1: { const char* res = reinterpret_cast(strchr(s, needle[0])); return (res != nullptr) ? 
res - s : std::string::npos; } case 2: result = swar64_strstr_memcmp<2>(s, n, needle, always_true); break; case 3: result = swar64_strstr_memcmp<3>(s, n, needle, memcmp1); break; case 4: result = swar64_strstr_memcmp<4>(s, n, needle, memcmp2); break; case 5: // Note: use memcmp4 rather memcmp3, as the last character // of needle is already proven to be equal result = swar64_strstr_memcmp<5>(s, n, needle, memcmp4); break; case 6: result = swar64_strstr_memcmp<6>(s, n, needle, memcmp4); break; case 7: result = swar64_strstr_memcmp<7>(s, n, needle, memcmp5); break; case 8: result = swar64_strstr_memcmp<8>(s, n, needle, memcmp6); break; case 9: // Note: use memcmp8 rather memcmp7 for the same reason as above. result = swar64_strstr_memcmp<9>(s, n, needle, memcmp8); break; case 10: result = swar64_strstr_memcmp<10>(s, n, needle, memcmp8); break; case 11: result = swar64_strstr_memcmp<11>(s, n, needle, memcmp9); break; case 12: result = swar64_strstr_memcmp<12>(s, n, needle, memcmp10); break; default: result = swar64_strstr_anysize(s, n, needle, k); break; } if (result <= n - k) { return result; } else { return std::string::npos; } } size_t swar64_strstr_v2(const std::string& s, const std::string& needle) { return swar64_strstr_v2(s.data(), s.size(), needle.data(), needle.size()); } ================================================ FILE: utils/ansi.cpp ================================================ namespace ansi { const int RED = 31; const int GREEN = 32; const int WHITE = 37; std::string seq(const std::string& str, int color) { return "\033[" + std::to_string(color) + "m" + str + "\033[0m"; } } // namespace ansi ================================================ FILE: utils/avx2.cpp ================================================ namespace avx2 { union proxy { __m256i vec; uint8_t u8[32]; uint16_t u16[16]; }; namespace dump { void epu16(const __m256i vec) { proxy p; p.vec = vec; for (int i=0; i < 16; i++) { printf("%04x ", p.u16[i]); } putchar('\n'); } void epu8(const 
__m256i vec) { proxy p; p.vec = vec; putchar('\''); for (int i=0; i < 32; i++) { printf("%02x ", p.u8[i]); } putchar('\''); putchar('\n'); } } // namespace dump } // namespace sse ================================================ FILE: utils/avx512.cpp ================================================ namespace avx512 { union proxy { __m512i vec; uint8_t u8[64]; uint16_t u16[32]; }; namespace dump { void epu16(const __m512i vec) { proxy p; p.vec = vec; for (int i=0; i < 32; i++) { printf("%04x ", p.u16[i]); } putchar('\n'); } void epu8(const __m512i vec) { proxy p; p.vec = vec; putchar('\''); for (int i=0; i < 64; i++) { printf("%02x ", p.u8[i]); } putchar('\''); putchar('\n'); } } // namespace dump } // namespace sse ================================================ FILE: utils/bits.cpp ================================================ namespace bits { template T clear_leftmost_set(const T value) { assert(value != 0); return value & (value - 1); } template unsigned get_first_bit_set(const T value) { assert(value != 0); return __builtin_ctz(value); } template <> unsigned get_first_bit_set(const uint64_t value) { assert(value != 0); return __builtin_ctzl(value); } } // namespace bits ================================================ FILE: utils/neon.cpp ================================================ namespace neon { namespace dump { void epu8(const uint8x16_t vec) { uint8_t p[16]; vst1q_u8(p, vec); putchar('\''); for (int i=0; i < 16; i++) { printf("%02x ", p[i]); } putchar('\''); putchar('\n'); } void epu8(const uint8x8_t vec) { uint8_t p[8]; vst1_u8(p, vec); putchar('\''); for (int i=0; i < 8; i++) { printf("%02x ", p[i]); } putchar('\''); putchar('\n'); } } // namespace dump } // namespace sse ================================================ FILE: utils/sse.cpp ================================================ namespace sse { template __m128i load(T ptr) { return _mm_loadu_si128(reinterpret_cast(ptr)); } __m128i mask_lower_bytes(size_t n) { // assert(n < 16) static 
const uint8_t mask[32] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }; return load(mask + 16 - n); } __m128i mask_higher_bytes(size_t n) { // assert(n < 16) static const uint8_t mask[32] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, }; return load(mask + 16 - n); } union proxy { __m128i vec; uint8_t u8[16]; uint16_t u16[8]; }; namespace dump { void epu16(const __m128i vec) { proxy p; p.vec = vec; for (int i=0; i < 8; i++) { printf("%04x ", p.u16[i]); } putchar('\n'); } void epu8(const __m128i vec) { proxy p; p.vec = vec; putchar('\''); for (int i=0; i < 16; i++) { printf("%02x ", p.u8[i]); } putchar('\''); putchar('\n'); } } // namespace dump } // namespace sse