Showing preview only (200K chars total). Download the full file or copy to clipboard to get everything.
Repository: WojciechMula/sse4-strstr
Branch: master
Commit: 9cdc4b6df817
Files: 55
Total size: 186.5 KB
Directory structure:
gitextract_jd4noqk0/
├── .gitignore
├── LICENSE
├── Makefile
├── README.rst
├── aarch64-strstr-v2.cpp
├── avx2-naive-strstr.cpp
├── avx2-naive-strstr64.cpp
├── avx2-naive-unrolled-strstr.cpp
├── avx2-strstr-v2-clang-specific.cpp
├── avx2-strstr-v2.cpp
├── avx2-strstr.cpp
├── avx512bw-strstr-v2.cpp
├── avx512bw-strstr-v3.cpp
├── avx512f-strstr-v2.cpp
├── avx512f-strstr.cpp
├── common.h
├── data/
│ └── placeholder
├── fixed-memcmp.cpp
├── make_words.sh
├── neon-strstr-v2.cpp
├── original/
│ ├── sse4_strstr-test.py
│ └── sse4_strstr.c
├── results/
│ ├── armv7-32bit-gcc4.9.2.txt
│ ├── armv8-64bit-clang3.8.0.txt
│ ├── bulldozer-fx-8510-gcc4.8.4-sse.txt
│ ├── cascadelake-Gold-5217-gcc-7.4.0-avx512bw.txt
│ ├── haswell-i7-4770-gcc5.4.1-avx2.txt
│ ├── knights-landing-7210-gcc5.3.0-avx512f.txt
│ ├── postprocess.py
│ ├── skylake-i7-6700-gcc5.4.1-avx2.txt
│ ├── skylake-i9-7900-gcc-5.4.1-avx512bw.txt
│ └── westmere-m540-gcc6.2.0-sse4.txt
├── scalar.cpp
├── src/
│ ├── all.h
│ ├── all_procedures.cpp
│ ├── application_base.cpp
│ ├── benchmark.cpp
│ ├── benchmark.h
│ ├── speedup.cpp
│ ├── unittests.cpp
│ └── validate.cpp
├── sse-naive-strstr.cpp
├── sse2-needle4.cpp
├── sse2-strstr.cpp
├── sse4-strstr-unrolled.cpp
├── sse4-strstr.cpp
├── sse4.2-strstr.cpp
├── swar32-strstr-v2.cpp
├── swar64-strstr-v2.cpp
└── utils/
├── ansi.cpp
├── avx2.cpp
├── avx512.cpp
├── bits.cpp
├── neon.cpp
└── sse.cpp
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
speedup_sse4
benchmark_sse4
unittests_sse4
validate_sse4
speedup_avx2
benchmark_avx2
unittests_avx2
validate_avx2
speedup_avx512f
benchmark_avx512f
unittests_avx512f
validate_avx512f
unittests_avx512bw
benchmark_avx512bw
validate_avx512bw
speedup_avx512bw
speedup_arm
unittests_arm
validate_arm
speedup_aarch64
unittests_aarch64
validate_aarch64
data/i386.txt
data/words
tags
================================================
FILE: LICENSE
================================================
Copyright (c) 2008-2016, Wojciech Muła
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
================================================
FILE: Makefile
================================================
# Targets that name commands rather than files.  Declaring them phony keeps a
# stray file called e.g. `all` or `test_sse4` from masking the rule.
.PHONY: all clean compile_intel build_intel build_arm build_aarch64
.PHONY: test_sse4 test_avx2 test_avx512f test_avx512bw test_arm test_aarch64
.PHONY: run_sse4 run_avx2 run_avx512f run_avx512bw run_arm run_aarch64

# Compiler flags shared by every build; the user's CXXFLAGS are appended last.
FLAGS=-std=c++11 -O3 -Wall -Wextra -pedantic -I. $(CXXFLAGS)

# Per-architecture flags.  Each wider x86 level also defines the HAVE_* macros
# of the narrower levels it builds on.
FLAGS_INTEL=$(FLAGS) -DHAVE_SSE_INSTRUCTIONS
FLAGS_SSE4=$(FLAGS_INTEL) -msse4.2
FLAGS_AVX2=$(FLAGS_INTEL) -mavx2 -DHAVE_AVX2_INSTRUCTIONS
FLAGS_AVX512F=$(FLAGS_INTEL) -mavx512f -DHAVE_AVX2_INSTRUCTIONS -DHAVE_AVX512F_INSTRUCTIONS
FLAGS_AVX512BW=$(FLAGS_INTEL) -mavx512bw -DHAVE_AVX2_INSTRUCTIONS -DHAVE_AVX512F_INSTRUCTIONS -DHAVE_AVX512BW_INSTRUCTIONS
FLAGS_ARM=$(FLAGS) -mfpu=neon -DHAVE_NEON_INSTRUCTIONS
FLAGS_AARCH64=$(FLAGS) -DHAVE_NEON_INSTRUCTIONS -DHAVE_AARCH64_ARCHITECTURE

# Source files each binary depends on.  The lists are cumulative: every level
# pulls in the list of the level below it.  The `*` patterns are expanded by
# make when the variables are used as prerequisites.
DEPS=utils/ansi.cpp utils/bits.cpp common.h fixed-memcmp.cpp
DEPS_SCALAR=swar64-strstr-v2.cpp swar32-strstr-v2.cpp scalar.cpp
DEPS_SSE4=sse4-strstr.cpp sse4-strstr-unrolled.cpp sse4.2-strstr.cpp sse2-strstr.cpp sse-naive-strstr.cpp sse2-needle4.cpp utils/sse.cpp $(DEPS) $(DEPS_SCALAR)
DEPS_AVX2=avx2-*.cpp utils/avx2.cpp $(DEPS_SSE4)
DEPS_AVX512F=avx512f-*.cpp utils/avx512.cpp $(DEPS_AVX2)
DEPS_AVX512BW=avx512bw-*.cpp utils/avx512.cpp $(DEPS_AVX512F)
DEPS_ARM=neon-strstr-v2.cpp $(DEPS) $(DEPS_SCALAR)
DEPS_AARCH64=aarch64-strstr-v2.cpp $(DEPS_ARM)
# Binaries built for each architecture family.
# NOTE: the last entry of each list must NOT end with a backslash, otherwise
# the following variable definition is swallowed into the list.
ALL_INTEL=\
    validate_sse4 \
    speedup_sse4 \
    benchmark_sse4 \
    unittests_sse4 \
    validate_avx2 \
    speedup_avx2 \
    benchmark_avx2 \
    unittests_avx2 \
    validate_avx512f \
    speedup_avx512f \
    benchmark_avx512f \
    unittests_avx512f \
    speedup_avx512bw \
    benchmark_avx512bw \
    validate_avx512bw \
    unittests_avx512bw

ALL_ARM=\
    validate_arm \
    unittests_arm \
    speedup_arm

ALL_AARCH64=\
    validate_aarch64 \
    unittests_aarch64 \
    speedup_aarch64

# Everything `clean` is allowed to remove.
ALL=$(ALL_INTEL) $(ALL_ARM) $(ALL_AARCH64)
# Default goal: builds nothing, only prints which targets are available.
all:
	@printf '%s\n' \
		"select target test_ARCH or run_ARCH" \
		"" \
		"test_ARCH runs unit and validation tests" \
		"run_ARCH runs performance tests" \
		"" \
		"ARCH might be:" \
		"* sse4" \
		"* avx2" \
		"* avx512f" \
		"* avx512bw" \
		"* arm" \
		"* aarch64"

# Build (without running) every binary of one architecture family.
build_intel: $(ALL_INTEL)

build_arm: $(ALL_ARM)

build_aarch64: $(ALL_AARCH64)
# Driver sources of the four kinds of binaries.  Each driver is compiled as a
# single translation unit, so the remaining files are listed only to trigger
# a rebuild when they change.
UNITTESTS_DEPS=src/unittests.cpp src/all_procedures.cpp
VALIDATE_DEPS=src/validate.cpp src/application_base.cpp src/all_procedures.cpp
SPEEDUP_DEPS=src/speedup.cpp src/application_base.cpp src/all_procedures.cpp
BENCHMARK_DEPS=src/benchmark.cpp src/benchmark.h src/application_base.cpp src/all_procedures.cpp

# --- SSE4 ---------------------------------------------------------------

validate_sse4: $(VALIDATE_DEPS) $(DEPS_SSE4)
	$(CXX) $(FLAGS_SSE4) src/validate.cpp -o $@

speedup_sse4: $(SPEEDUP_DEPS) $(DEPS_SSE4)
	$(CXX) $(FLAGS_SSE4) -DNDEBUG src/speedup.cpp -o $@

benchmark_sse4: $(BENCHMARK_DEPS) $(DEPS_SSE4)
	$(CXX) $(FLAGS_SSE4) -DNDEBUG src/benchmark.cpp -o $@

unittests_sse4: $(UNITTESTS_DEPS) $(DEPS_SSE4)
	$(CXX) $(FLAGS_SSE4) src/unittests.cpp -o $@

# --- AVX2 ---------------------------------------------------------------

validate_avx2: $(VALIDATE_DEPS) $(DEPS_AVX2)
	$(CXX) $(FLAGS_AVX2) src/validate.cpp -o $@

speedup_avx2: $(SPEEDUP_DEPS) $(DEPS_AVX2)
	$(CXX) $(FLAGS_AVX2) -DNDEBUG src/speedup.cpp -o $@

# depends on the AVX2 sources (not only the SSE4 ones) so that editing an
# avx2-*.cpp file rebuilds the benchmark as well
benchmark_avx2: $(BENCHMARK_DEPS) $(DEPS_AVX2)
	$(CXX) $(FLAGS_AVX2) -DNDEBUG src/benchmark.cpp -o $@

unittests_avx2: $(UNITTESTS_DEPS) $(DEPS_AVX2)
	$(CXX) $(FLAGS_AVX2) src/unittests.cpp -o $@

# --- AVX512F ------------------------------------------------------------

validate_avx512f: $(VALIDATE_DEPS) $(DEPS_AVX512F)
	$(CXX) $(FLAGS_AVX512F) src/validate.cpp -o $@

benchmark_avx512f: $(BENCHMARK_DEPS) $(DEPS_AVX512F)
	$(CXX) $(FLAGS_AVX512F) -DNDEBUG src/benchmark.cpp -o $@

speedup_avx512f: $(SPEEDUP_DEPS) $(DEPS_AVX512F)
	$(CXX) $(FLAGS_AVX512F) -DNDEBUG src/speedup.cpp -o $@

unittests_avx512f: $(UNITTESTS_DEPS) $(DEPS_AVX512F)
	$(CXX) $(FLAGS_AVX512F) src/unittests.cpp -o $@

# --- AVX512BW -----------------------------------------------------------

validate_avx512bw: $(VALIDATE_DEPS) $(DEPS_AVX512BW)
	$(CXX) $(FLAGS_AVX512BW) src/validate.cpp -o $@

speedup_avx512bw: $(SPEEDUP_DEPS) $(DEPS_AVX512BW)
	$(CXX) $(FLAGS_AVX512BW) -DNDEBUG src/speedup.cpp -o $@

benchmark_avx512bw: $(BENCHMARK_DEPS) $(DEPS_AVX512BW)
	$(CXX) $(FLAGS_AVX512BW) -DNDEBUG src/benchmark.cpp -o $@

unittests_avx512bw: $(UNITTESTS_DEPS) $(DEPS_AVX512BW)
	$(CXX) $(FLAGS_AVX512BW) src/unittests.cpp -o $@

# --- ARM (32-bit, Neon) -------------------------------------------------

validate_arm: $(VALIDATE_DEPS) $(DEPS_ARM)
	$(CXX) $(FLAGS_ARM) src/validate.cpp -o $@

speedup_arm: $(SPEEDUP_DEPS) $(DEPS_ARM)
	$(CXX) $(FLAGS_ARM) -DNDEBUG src/speedup.cpp -o $@

unittests_arm: $(UNITTESTS_DEPS) $(DEPS_ARM)
	$(CXX) $(FLAGS_ARM) src/unittests.cpp -o $@

# --- AArch64 ------------------------------------------------------------

validate_aarch64: $(VALIDATE_DEPS) $(DEPS_AARCH64)
	$(CXX) $(FLAGS_AARCH64) src/validate.cpp -o $@

speedup_aarch64: $(SPEEDUP_DEPS) $(DEPS_AARCH64)
	$(CXX) $(FLAGS_AARCH64) -DNDEBUG src/speedup.cpp -o $@

# depends on DEPS_AARCH64 so that aarch64-strstr-v2.cpp triggers a rebuild
unittests_aarch64: $(UNITTESTS_DEPS) $(DEPS_AARCH64)
	$(CXX) $(FLAGS_AARCH64) src/unittests.cpp -o $@
# Sample haystack.  The file is downloaded under a temporary name and renamed
# only after wget succeeds, so an interrupted download never leaves a
# truncated data/i386.txt that make would consider up to date.
data/i386.txt:
	wget -O $@.tmp http://css.csail.mit.edu/6.858/2013/readings/i386.txt
	mv -f $@.tmp $@

# Word list produced from the sample text by make_words.sh.
data/words: data/i386.txt
	sh make_words.sh $^ $@
# test_ARCH: run the unit tests, then validate against the sample data.
# run_ARCH:  run the speed comparison on the sample data.
# In every recipe `$<` is the first prerequisite and `$(word 2,$^)` the second.

test_sse4: unittests_sse4 validate_sse4 data/words data/i386.txt
	./$<
	./$(word 2,$^) data/i386.txt data/words

run_sse4: speedup_sse4 data/words data/i386.txt
	./$< data/i386.txt data/words

test_avx2: unittests_avx2 validate_avx2 data/words data/i386.txt
	./$<
	./$(word 2,$^) data/i386.txt data/words

run_avx2: speedup_avx2 data/words data/i386.txt
	./$< data/i386.txt data/words

test_avx512f: unittests_avx512f validate_avx512f data/words data/i386.txt
	./$<
	./$(word 2,$^) data/i386.txt data/words

run_avx512f: speedup_avx512f data/words data/i386.txt
	./$< data/i386.txt data/words

run_avx512bw: speedup_avx512bw data/words data/i386.txt
	./$< data/i386.txt data/words

test_avx512bw: unittests_avx512bw validate_avx512bw data/words data/i386.txt
	./$<
	./$(word 2,$^) data/i386.txt data/words

test_arm: unittests_arm validate_arm data/words data/i386.txt
	./$<
	./$(word 2,$^) data/i386.txt data/words

run_arm: speedup_arm data/words data/i386.txt
	# my Raspberry Pi is slow, repeat count = 1 is enough
	./$< data/i386.txt data/words 1

test_aarch64: unittests_aarch64 validate_aarch64 data/words data/i386.txt
	./$<
	./$(word 2,$^) data/i386.txt data/words

run_aarch64: speedup_aarch64 data/words data/i386.txt
	./$< data/i386.txt data/words 1
# Build all x86 binaries without running them.
compile_intel: $(ALL_INTEL)

# Remove every binary this Makefile can produce ($(RM) is make's `rm -f`).
clean:
	$(RM) $(ALL)
================================================
FILE: README.rst
================================================
================================================================================
SIMD-friendly algorithms for substring searching
================================================================================
Sample programs for article "SIMD-friendly algorithms for substring searching"
(http://0x80.pl/articles/simd-strfind.html).
The **root directory** contains C++11 procedures implemented using intrinsics
for SSE, SSE4, AVX2, AVX512F, AVX512BW and ARM Neon (both ARMv7 and ARMv8).
The subdirectory **original** contains 32-bit programs with inline assembly,
written in 2008 for another article__.
__ http://0x80.pl/articles/sse4_substring_locate.html
Usage
------------------------------------------------------------------------
To run unit and validation tests type ``make test_ARCH``, to run
performance tests type ``make run_ARCH``. Value ``ARCH`` selects
the CPU architecture:
* sse4,
* avx2,
* avx512f,
* avx512bw,
* arm,
* aarch64.
Performance results
------------------------------------------------------------------------
The subdirectory ``results`` contains raw timings from various computers.
================================================
FILE: aarch64-strstr-v2.cpp
================================================
// Substring search for needles of arbitrary length (Neon, 16 positions per step).
//
// The first and the last byte of the needle are broadcast and compared with
// the haystack at offsets `i` and `i + k - 1`; every position where both
// match is a candidate whose interior bytes are then verified with memcmp.
//
// Returns the offset of the first verified candidate or std::string::npos.
// Whole 16-byte blocks are loaded for every i < n, so the function reads past
// s + n and may report an offset greater than n - k; the caller is expected
// to range-check the result.
size_t FORCE_INLINE aarch64_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) {

    assert(k > 0);
    assert(n > 0);

    const uint8x16_t head = vdupq_n_u8(needle[0]);
    const uint8x16_t tail = vdupq_n_u8(needle[k - 1]);
    const uint8_t* text   = reinterpret_cast<const uint8_t*>(s);

    for (size_t base = 0; base < n; base += 16) {

        const uint8x16_t head_hits = vceqq_u8(head, vld1q_u8(text + base));
        const uint8x16_t tail_hits = vceqq_u8(tail, vld1q_u8(text + base + k - 1));
        const uint64x2_t hits      = vreinterpretq_u64_u8(vandq_u8(head_hits, tail_hits));

        // positions 0..7: each byte of the 64-bit lane flags one position
        uint64_t lane = vgetq_lane_u64(hits, 0);
        for (size_t j = 0; lane != 0; ++j, lane >>= 8) {
            if ((lane & 0xff) && (memcmp(s + base + j + 1, needle + 1, k - 2) == 0)) {
                return base + j;
            }
        }

        // positions 8..15
        lane = vgetq_lane_u64(hits, 1);
        for (size_t j = 0; lane != 0; ++j, lane >>= 8) {
            if ((lane & 0xff) && (memcmp(s + base + j + 8 + 1, needle + 1, k - 2) == 0)) {
                return base + j + 8;
            }
        }
    }

    return std::string::npos;
}
// ------------------------------------------------------------------------
// Same scheme as the generic Neon search, but the needle length `k` is a
// compile-time constant and candidates are verified by `memcmp_fun`, a
// callable taking (haystack + 1, needle + 1) and returning true on equality.
//
// Returns the offset of the first verified candidate or std::string::npos;
// the offset is not range-checked against n - k here.
template <size_t k, typename MEMCMP>
size_t FORCE_INLINE aarch64_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) {

    assert(k > 0);
    assert(n > 0);

    const uint8x16_t head = vdupq_n_u8(needle[0]);
    const uint8x16_t tail = vdupq_n_u8(needle[k - 1]);
    const uint8_t* text   = reinterpret_cast<const uint8_t*>(s);

    for (size_t base = 0; base < n; base += 16) {

        const uint8x16_t head_hits = vceqq_u8(head, vld1q_u8(text + base));
        const uint8x16_t tail_hits = vceqq_u8(tail, vld1q_u8(text + base + k - 1));
        const uint64x2_t hits      = vreinterpretq_u64_u8(vandq_u8(head_hits, tail_hits));

        // positions 0..7 of the block
        uint64_t lane = vgetq_lane_u64(hits, 0);
        for (size_t j = 0; lane != 0; ++j, lane >>= 8) {
            if ((lane & 0xff) && (memcmp_fun(s + base + j + 1, needle + 1))) {
                return base + j;
            }
        }

        // positions 8..15 of the block
        lane = vgetq_lane_u64(hits, 1);
        for (size_t j = 0; lane != 0; ++j, lane >>= 8) {
            if ((lane & 0xff) && (memcmp_fun(s + base + j + 8 + 1, needle + 1))) {
                return base + j + 8;
            }
        }
    }

    return std::string::npos;
}
// ------------------------------------------------------------------------
// Finds the first occurrence of `needle` (k bytes) in `s` (n bytes).
//
// Returns the offset of the match or std::string::npos. An empty needle
// matches at offset 0.
//
// Short needles are dispatched to specialisations with a fixed-size
// comparison; the comparators (always_true, memcmpN) are defined elsewhere
// and receive pointers to the second byte of the candidate and of the needle.
size_t aarch64_strstr_v2(const char* s, size_t n, const char* needle, size_t k) {

    size_t result = std::string::npos;

    if (n < k) {
        return result;
    }

    switch (k) {
        case 0:
            return 0;

        case 1: {
            // memchr, unlike strchr, honours the length `n`: it neither stops
            // at an embedded NUL nor runs past the end of an unterminated buffer
            const char* res = static_cast<const char*>(memchr(s, needle[0], n));

            return (res != nullptr) ? static_cast<size_t>(res - s) : std::string::npos;
        }

        case 2:
            result = aarch64_strstr_memcmp<2>(s, n, needle, always_true);
            break;

        case 3:
            result = aarch64_strstr_memcmp<3>(s, n, needle, memcmp1);
            break;

        case 4:
            result = aarch64_strstr_memcmp<4>(s, n, needle, memcmp2);
            break;

        case 5:
            result = aarch64_strstr_memcmp<5>(s, n, needle, memcmp4);
            break;

        case 6:
            result = aarch64_strstr_memcmp<6>(s, n, needle, memcmp4);
            break;

        case 7:
            result = aarch64_strstr_memcmp<7>(s, n, needle, memcmp5);
            break;

        case 8:
            result = aarch64_strstr_memcmp<8>(s, n, needle, memcmp6);
            break;

        case 9:
            result = aarch64_strstr_memcmp<9>(s, n, needle, memcmp8);
            break;

        case 10:
            result = aarch64_strstr_memcmp<10>(s, n, needle, memcmp8);
            break;

        case 11:
            result = aarch64_strstr_memcmp<11>(s, n, needle, memcmp9);
            break;

        case 12:
            result = aarch64_strstr_memcmp<12>(s, n, needle, memcmp10);
            break;

        default:
            result = aarch64_strstr_anysize(s, n, needle, k);
            break;
    }

    // the vector procedures scan whole 16-byte blocks and may report a
    // position where the needle would stick out past the end of the haystack
    if (result <= n - k) {
        return result;
    } else {
        return std::string::npos;
    }
}
// ------------------------------------------------------------------------
// Convenience overload: searches `needle` in `s`, forwarding the raw buffers
// and sizes of both strings to the pointer-based variant.
size_t aarch64_strstr_v2(const std::string& s, const std::string& needle) {

    const char* text    = s.data();
    const char* pattern = needle.data();

    return aarch64_strstr_v2(text, s.size(), pattern, needle.size());
}
================================================
FILE: avx2-naive-strstr.cpp
================================================
// Method described in https://arxiv.org/pdf/1612.01506.pdf
//
// Implementation by Daniel Lemire
// https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/blob/master/extra/simd/substring/substring.c
// Naive AVX2 search: for a block of 32 start positions, compares the j-th
// needle byte with the haystack shifted by j and ANDs the resulting bitmasks;
// a bit that survives all k rounds marks a full match.
//
// Returns the lowest matching offset of the first block containing a match,
// or std::string::npos. Loads are always 32 bytes wide, so the function reads
// past s + n and the reported offset may exceed n - k.
size_t FORCE_INLINE avx2_naive_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) {

    assert(k > 0);
    assert(n > 0);

    if (n == k) {
        if (memcmp(s, needle, k) == 0) {
            return 0;
        }
        return std::string::npos;
    }

    const size_t positions = n - k + 1; // number of valid start offsets

    for (size_t base = 0; base < positions; base += 32) {

        uint32_t alive = 0xffffffff;
        size_t j = 0;

        // stop early once no candidate in this block is left
        while ((j < k) && (alive != 0)) {
            const __m256i chunk  = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s + base + j));
            const __m256i letter = _mm256_set1_epi8(needle[j]);
            const uint32_t equal = _mm256_movemask_epi8(_mm256_cmpeq_epi8(chunk, letter));

            alive &= equal;
            ++j;
        }

        if (alive != 0) {
            return base + __builtin_ctz(alive);
        }
    }

    return std::string::npos;
}
// ------------------------------------------------------------------------
// Finds the first occurrence of `needle` (k bytes) in `s` (n bytes) with the
// naive AVX2 procedure.
//
// Returns the offset of the match or std::string::npos. An empty needle
// matches at offset 0.
size_t avx2_naive_strstr(const char* s, size_t n, const char* needle, size_t k) {

    if (n < k) {
        return std::string::npos;
    }

    // The vector procedure asserts k > 0 (and n > 0), so an empty needle must
    // not reach it.
    if (k == 0) {
        return 0;
    }

    const size_t result = avx2_naive_strstr_anysize(s, n, needle, k);

    // the procedure works on 32-position blocks and may report an offset at
    // which the needle would not fit into the haystack any more
    if (result <= n - k) {
        return result;
    } else {
        return std::string::npos;
    }
}
// ------------------------------------------------------------------------
// Convenience overload forwarding the buffers and sizes of both strings to
// the pointer-based variant.
size_t avx2_naive_strstr(const std::string& s, const std::string& needle) {

    const char* text    = s.data();
    const char* pattern = needle.data();

    return avx2_naive_strstr(text, s.size(), pattern, needle.size());
}
================================================
FILE: avx2-naive-strstr64.cpp
================================================
// Method described in https://arxiv.org/pdf/1612.01506.pdf
//
// Implementation by Daniel Lemire
// https://github.com/WojciechMula/sse4-strstr/issues/2
// AVX2 search handling 64 start positions per iteration (two 32-byte blocks).
//
// The first and the last needle byte are compared with the haystack at
// offsets `i` and `i + k - 1`; positions where both match are candidates
// whose interior bytes are verified with memcmp.
//
// Returns the offset of the first verified candidate or std::string::npos.
// Loads are always full vectors, so the function reads past s + n and the
// reported offset may exceed n - k; the caller must range-check it.
size_t FORCE_INLINE avx2_naive_strstr_anysize64(const char* s, size_t n, const char* needle, size_t k) {

    assert(k > 0);
    assert(n > 0);

    const __m256i first = _mm256_set1_epi8(needle[0]);
    const __m256i last  = _mm256_set1_epi8(needle[k - 1]);

    // Number of bytes strictly between the first and the last one. The plain
    // `k - 2` wraps around to a huge value for k == 1 and would make memcmp
    // read far out of bounds.
    const size_t interior = (k > 2) ? k - 2 : 0;

    for (size_t i = 0; i < n; i += 64) {

        const __m256i block_first1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s + i));
        const __m256i block_last1  = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s + i + k - 1));

        const __m256i block_first2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s + i + 32));
        const __m256i block_last2  = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s + i + k - 1 + 32));

        const __m256i eq_first1 = _mm256_cmpeq_epi8(first, block_first1);
        const __m256i eq_last1  = _mm256_cmpeq_epi8(last, block_last1);

        const __m256i eq_first2 = _mm256_cmpeq_epi8(first, block_first2);
        const __m256i eq_last2  = _mm256_cmpeq_epi8(last, block_last2);

        const uint32_t mask1 = _mm256_movemask_epi8(_mm256_and_si256(eq_first1, eq_last1));
        const uint32_t mask2 = _mm256_movemask_epi8(_mm256_and_si256(eq_first2, eq_last2));

        // bit b set <=> position i + b is a candidate
        uint64_t mask = mask1 | (static_cast<uint64_t>(mask2) << 32);

        while (mask != 0) {
            const int bitpos = __builtin_ctzll(mask);

            if (memcmp(s + i + bitpos + 1, needle + 1, interior) == 0) {
                return i + bitpos;
            }

            mask = bits::clear_leftmost_set(mask);
        }
    }

    return std::string::npos;
}
// ------------------------------------------------------------------------
// Finds the first occurrence of `needle` (k bytes) in `s` (n bytes) with the
// 64-positions-per-step AVX2 procedure.
//
// Returns the offset of the match or std::string::npos. An empty needle
// matches at offset 0.
size_t avx2_naive_strstr64(const char* s, size_t n, const char* needle, size_t k) {

    if (n < k) {
        return std::string::npos;
    }

    // The vector procedure reads needle[k - 1] and verifies k - 2 interior
    // bytes, so needles shorter than two bytes are handled here.
    if (k == 0) {
        return 0;
    }

    if (k == 1) {
        const char* res = static_cast<const char*>(memchr(s, needle[0], n));

        return (res != nullptr) ? static_cast<size_t>(res - s) : std::string::npos;
    }

    const size_t result = avx2_naive_strstr_anysize64(s, n, needle, k);

    // the procedure scans whole blocks and may report an offset at which the
    // needle would not fit into the haystack any more
    if (result <= n - k) {
        return result;
    } else {
        return std::string::npos;
    }
}
// ------------------------------------------------------------------------
// Convenience overload forwarding the buffers and sizes of both strings to
// the pointer-based variant.
size_t avx2_naive_strstr64(const std::string& s, const std::string& needle) {

    const char* text    = s.data();
    const char* pattern = needle.data();

    return avx2_naive_strstr64(text, s.size(), pattern, needle.size());
}
================================================
FILE: avx2-naive-unrolled-strstr.cpp
================================================
// Method described in https://arxiv.org/pdf/1612.01506.pdf
//
// Implementation by Daniel Lemire
// Naive AVX2 search with the per-byte comparison loop unrolled four times.
//
// For a block of 32 start positions the j-th needle byte is compared with the
// haystack shifted by j and the bitmasks are ANDed together; a bit surviving
// all k rounds marks a full match.
//
// Expects n >= k. Returns the lowest matching offset of the first block with
// a match, or std::string::npos. Loads are always 32 bytes wide, so the
// function reads past s + n and the reported offset may exceed n - k; the
// caller must range-check it.
size_t FORCE_INLINE avx2_naive_strstr_unrolled_anysize(const char* s, size_t n, const char* needle, size_t k) {

    // assert(n % 32 == 0); // deliberately commented out

    // There are n - k + 1 valid start positions (0 .. n - k). The previous
    // bound `i < n - k` skipped the block starting at n - k, losing matches
    // when n == k or when n - k is a multiple of 32.
    for (size_t i = 0; i < n - k + 1; i += 32) {
        uint32_t found = 0xFFFFFFFF; // 32 1-bits
        size_t j = 0;

        // four needle bytes per round
        for (; (j + 3 < k) && (found != 0) ; j += 4) {
            __m256i textvector1   = _mm256_loadu_si256((const __m256i *)(s + i + j));
            __m256i needlevector1 = _mm256_set1_epi8(needle[j]);
            __m256i textvector2   = _mm256_loadu_si256((const __m256i *)(s + i + j + 1));
            __m256i needlevector2 = _mm256_set1_epi8(needle[j + 1]);
            __m256i cmp1 = _mm256_cmpeq_epi8(textvector1, needlevector1);
            __m256i cmp2 = _mm256_cmpeq_epi8(textvector2, needlevector2);

            __m256i textvector3   = _mm256_loadu_si256((const __m256i *)(s + i + j + 2));
            __m256i needlevector3 = _mm256_set1_epi8(needle[j + 2]);
            __m256i textvector4   = _mm256_loadu_si256((const __m256i *)(s + i + j + 3));
            __m256i needlevector4 = _mm256_set1_epi8(needle[j + 3]);
            __m256i cmp3 = _mm256_cmpeq_epi8(textvector3, needlevector3);
            __m256i cmp4 = _mm256_cmpeq_epi8(textvector4, needlevector4);

            __m256i cmp12 = _mm256_and_si256(cmp1, cmp2);
            __m256i cmp34 = _mm256_and_si256(cmp3, cmp4);
            uint32_t bitmask = _mm256_movemask_epi8(_mm256_and_si256(cmp12, cmp34));
            found = found & bitmask;
        }

        // remaining 0..3 needle bytes
        for (; (j < k) && (found != 0) ; ++j) {
            __m256i textvector   = _mm256_loadu_si256((const __m256i *)(s + i + j));
            __m256i needlevector = _mm256_set1_epi8(needle[j]);
            uint32_t bitmask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(textvector, needlevector));
            found = found & bitmask;
        }

        if (found != 0) {
            // got a match... maybe (it can lie beyond n - k)
            return i + __builtin_ctz(found);
        }
    }

    return std::string::npos;
}
// ------------------------------------------------------------------------
// Finds the first occurrence of `needle` (k bytes) in `s` (n bytes) with the
// unrolled naive AVX2 procedure.
//
// Returns the offset of the match or std::string::npos. An empty needle
// matches at offset 0, also in an empty haystack.
size_t avx2_naive_unrolled_strstr(const char* s, size_t n, const char* needle, size_t k) {

    if (n < k) {
        return std::string::npos;
    }

    // nothing to compare; do not touch the buffers at all
    if (k == 0) {
        return 0;
    }

    const size_t result = avx2_naive_strstr_unrolled_anysize(s, n, needle, k);

    // the procedure works on 32-position blocks and may report an offset at
    // which the needle would not fit into the haystack any more
    if (result <= n - k) {
        return result;
    } else {
        return std::string::npos;
    }
}
// ------------------------------------------------------------------------
// Convenience overload forwarding the buffers and sizes of both strings to
// the pointer-based variant.
size_t avx2_naive_unrolled_strstr(const std::string& s, const std::string& needle) {

    const char* text    = s.data();
    const char* pattern = needle.data();

    return avx2_naive_unrolled_strstr(text, s.size(), pattern, needle.size());
}
================================================
FILE: avx2-strstr-v2-clang-specific.cpp
================================================
/*
The following templates implement the loop, where K is a template parameter.
for (unsigned i=1; i < K; i++) {
const __m256i substring = _mm256_alignr_epi8(next1, curr, i);
eq = _mm256_and_si256(eq, _mm256_cmpeq_epi8(substring, broadcasted[i]));
}
Clang complains that the loop parameter `i` is a variable and it cannot be
applied as a parameter _mm256_alignr_epi8. GCC somehow deals with it.
*/
#ifdef __clang__

// Compile-time unrolled loop. inner_loop_aux<K, index, done> performs the
// step for `index` and then instantiates the step for `index + 1`; `done`
// becomes true when index reaches K, which selects the empty specialisation
// and ends the recursion. This way `index` is a constant expression, as
// _mm256_alignr_epi8 requires.
template <size_t K, int index, bool done>
struct inner_loop_aux;

// end of recursion: index == K, nothing left to compare
template <size_t K, int index>
struct inner_loop_aux<K, index, true> {
    void operator()(__m256i&, const __m256i&, const __m256i&, const __m256i (&)[K]) {
        // nop
    }
};

// single step: AND into `eq` the comparison of the input shifted by `index`
// bytes with the broadcast needle byte number `index`
template <size_t K, int index>
struct inner_loop_aux<K, index, false> {
    void operator()(__m256i& eq, const __m256i& next1, const __m256i& curr, const __m256i (&broadcasted)[K]) {
        const __m256i shifted = _mm256_alignr_epi8(next1, curr, index);
        const __m256i matches = _mm256_cmpeq_epi8(shifted, broadcasted[index]);
        eq = _mm256_and_si256(eq, matches);

        using following_step = inner_loop_aux<K, index + 1, index + 1 == K>;
        following_step()(eq, next1, curr, broadcasted);
    }
};

// Entry point: runs the steps for index = 0 .. K - 1.
template <size_t K>
struct inner_loop {
    void operator()(__m256i& eq, const __m256i& next1, const __m256i& curr, const __m256i (&broadcasted)[K]) {
        static_assert(K > 0, "wrong value");

        using first_step = inner_loop_aux<K, 0, false>;
        first_step()(eq, next1, curr, broadcasted);
    }
};

#endif
================================================
FILE: avx2-strstr-v2.cpp
================================================
// implements scheme described in http://0x80.pl/articles/simd-friendly-karp-rabin.html
// Substring search for needles of arbitrary length (AVX2, 32 positions per step).
//
// The first and the last needle byte are compared with the haystack at
// offsets `i` and `i + k - 1`; positions where both match are candidates
// whose interior bytes are verified with memcmp.
//
// Returns the offset of the first verified candidate or std::string::npos.
// Whole 32-byte blocks are loaded for every i < n, so the function reads past
// s + n and the reported offset may exceed n - k.
size_t FORCE_INLINE avx2_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) {

    assert(k > 0);
    assert(n > 0);

    const __m256i head = _mm256_set1_epi8(needle[0]);
    const __m256i tail = _mm256_set1_epi8(needle[k - 1]);

    for (size_t base = 0; base < n; base += 32) {

        const char* block = s + base;

        const __m256i at_head = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(block));
        const __m256i at_tail = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(block + k - 1));

        const __m256i both = _mm256_and_si256(_mm256_cmpeq_epi8(head, at_head),
                                              _mm256_cmpeq_epi8(tail, at_tail));

        // visit candidates from the lowest position upwards
        for (uint32_t mask = _mm256_movemask_epi8(both); mask != 0; mask = bits::clear_leftmost_set(mask)) {

            const auto bitpos = bits::get_first_bit_set(mask);

            if (memcmp(block + bitpos + 1, needle + 1, k - 2) == 0) {
                return base + bitpos;
            }
        }
    }

    return std::string::npos;
}
#include "avx2-strstr-v2-clang-specific.cpp"
// Search for a needle of compile-time length K (1..15) done entirely in
// registers: for every byte of the needle the input shifted by that many
// bytes is compared with the broadcast byte and the results are ANDed.
//
// Returns the lowest matching offset of the first 32-byte block containing a
// match, or std::string::npos. Each iteration loads the block at s + i + 32,
// so the function reads past s + n; the reported offset is not range-checked.
template <size_t K>
size_t FORCE_INLINE avx2_strstr_eq(const char* s, size_t n, const char* needle) {

    static_assert(K > 0 && K < 16, "K must be in range [1..15]");
    assert(n > 0);

    __m256i broadcasted[K];
    for (unsigned i=0; i < K; i++) {
        broadcasted[i] = _mm256_set1_epi8(needle[i]);
    }

    __m256i curr = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s));

    for (size_t i = 0; i < n; i += 32) {

        const __m256i next = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s + i + 32));

        __m256i eq = _mm256_cmpeq_epi8(curr, broadcasted[0]);

        // AVX2 palignr works on 128-bit lanes, thus some extra work is needed
        //
        // curr = [a, b] (2 x 128 bit)
        // next = [c, d]
        // substring = [palignr(b, a, i), palignr(c, b, i)]
        //
        // next1 = [b, c]: selector 0x21 puts the high half of `curr` into the
        // low lane and the low half of `next` into the high lane. Building it
        // with a single permute avoids reading an uninitialized vector.
        const __m256i next1 = _mm256_permute2x128_si256(curr, next, 0x21);

#ifndef __clang__
        for (unsigned j=1; j < K; j++) {
            const __m256i substring = _mm256_alignr_epi8(next1, curr, j);
            eq = _mm256_and_si256(eq, _mm256_cmpeq_epi8(substring, broadcasted[j]));
        }
#else
        inner_loop<K>()(eq, next1, curr, broadcasted);
#endif

        curr = next;

        const uint32_t mask = _mm256_movemask_epi8(eq);

        if (mask != 0) {
            return i + bits::get_first_bit_set(mask);
        }
    }

    return std::string::npos;
}
// Same scheme as the generic AVX2 search, but the needle length `k` is a
// compile-time constant and candidates are verified by `memcmp_fun`, a
// callable taking (candidate + 1, needle + 1) and returning true on equality.
//
// Returns the offset of the first verified candidate or std::string::npos;
// the offset is not range-checked against n - k here.
template <size_t k, typename MEMCMP>
size_t FORCE_INLINE avx2_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) {

    assert(k > 0);
    assert(n > 0);

    const __m256i head = _mm256_set1_epi8(needle[0]);
    const __m256i tail = _mm256_set1_epi8(needle[k - 1]);

    for (size_t base = 0; base < n; base += 32) {

        const char* block = s + base;

        const __m256i at_head = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(block));
        const __m256i at_tail = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(block + k - 1));

        const __m256i both = _mm256_and_si256(_mm256_cmpeq_epi8(head, at_head),
                                              _mm256_cmpeq_epi8(tail, at_tail));

        // visit candidates from the lowest position upwards
        for (uint32_t mask = _mm256_movemask_epi8(both); mask != 0; mask = bits::clear_leftmost_set(mask)) {

            const auto bitpos = bits::get_first_bit_set(mask);

            if (memcmp_fun(block + bitpos + 1, needle + 1)) {
                return base + bitpos;
            }
        }
    }

    return std::string::npos;
}
// ------------------------------------------------------------------------
// Finds the first occurrence of `needle` (k bytes) in `s` (n bytes).
//
// Returns the offset of the match or std::string::npos. An empty needle
// matches at offset 0.
//
// Short needles are dispatched to specialisations with a fixed-size
// comparison; the memcmpN comparators are defined elsewhere and receive
// pointers to the second byte of the candidate and of the needle.
size_t avx2_strstr_v2(const char* s, size_t n, const char* needle, size_t k) {

    size_t result = std::string::npos;

    if (n < k) {
        return result;
    }

    switch (k) {
        case 0:
            return 0;

        case 1: {
            // memchr, unlike strchr, honours the length `n`: it neither stops
            // at an embedded NUL nor runs past the end of an unterminated buffer
            const char* res = static_cast<const char*>(memchr(s, needle[0], n));

            return (res != nullptr) ? static_cast<size_t>(res - s) : std::string::npos;
        }

        case 2:
            result = avx2_strstr_eq<2>(s, n, needle);
            break;

        case 3:
            result = avx2_strstr_memcmp<3>(s, n, needle, memcmp1);
            break;

        case 4:
            result = avx2_strstr_memcmp<4>(s, n, needle, memcmp2);
            break;

        case 5:
            // Note: use memcmp4 rather than memcmp3, as the last character
            // of needle is already proven to be equal
            result = avx2_strstr_memcmp<5>(s, n, needle, memcmp4);
            break;

        case 6:
            result = avx2_strstr_memcmp<6>(s, n, needle, memcmp4);
            break;

        case 7:
            result = avx2_strstr_memcmp<7>(s, n, needle, memcmp5);
            break;

        case 8:
            result = avx2_strstr_memcmp<8>(s, n, needle, memcmp6);
            break;

        case 9:
            // Note: use memcmp8 rather than memcmp7 for the same reason as above.
            result = avx2_strstr_memcmp<9>(s, n, needle, memcmp8);
            break;

        case 10:
            result = avx2_strstr_memcmp<10>(s, n, needle, memcmp8);
            break;

        case 11:
            result = avx2_strstr_memcmp<11>(s, n, needle, memcmp9);
            break;

        case 12:
            result = avx2_strstr_memcmp<12>(s, n, needle, memcmp10);
            break;

        default:
            result = avx2_strstr_anysize(s, n, needle, k);
            break;
    }

    // the vector procedures scan whole 32-byte blocks and may report a
    // position where the needle would stick out past the end of the haystack
    if (result <= n - k) {
        return result;
    } else {
        return std::string::npos;
    }
}
// ------------------------------------------------------------------------
// Convenience overload forwarding the buffers and sizes of both strings to
// the pointer-based variant.
size_t avx2_strstr_v2(const std::string& s, const std::string& needle) {

    const char* text    = s.data();
    const char* pattern = needle.data();

    return avx2_strstr_v2(text, s.size(), pattern, needle.size());
}
================================================
FILE: avx2-strstr.cpp
================================================
// Search for needles longer than 4 bytes using MPSADBW: the instruction
// computes sums of absolute differences between the 4-byte needle prefix and
// 4-byte windows of the input; a zero sum means the window equals the prefix.
// The rest of the needle is then verified with memcmp.
//
// Sixteen start positions are examined per iteration. Returns the offset of
// the first verified candidate or std::string::npos. Every iteration loads 32
// bytes from s + i, so the function reads past s + n and the reported offset
// may exceed n - neddle_size.
size_t avx2_strstr_long(const char* s, size_t n, const char* neddle, size_t neddle_size) {

    assert(neddle_size > 4);
    assert(n > 0);

    // memcpy instead of dereferencing a casted pointer: `neddle` need not be
    // 4-byte aligned and the cast breaks the strict aliasing rule
    uint32_t prefix32;
    memcpy(&prefix32, neddle, sizeof(prefix32));

    const __m256i prefix = _mm256_set1_epi32(prefix32);
    const __m256i zeros  = _mm256_setzero_si256();

    const __m256i permute = _mm256_setr_epi32(
        0, 1, 2, 0,
        2, 3, 4, 0
    );

    for (size_t i = 0; i < n; i += 16) {

        const __m256i in = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s + i));
        /*
            [00|01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16|17|18|19|20|21|22|23|24|25|26|27|28|29|30|31]
                                                       lane | boundary
            [00|01|02|03|04|05|06|07|08|09|10|11|??|??|??|??|08|09|10|11|12|13|14|15|16|17|18|19|??|??|??|??]
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        */
        const __m256i data   = _mm256_permutevar8x32_epi32(in, permute);
        const __m256i result = _mm256_mpsadbw_epu8(data, prefix, 0);

        const __m256i cmp = _mm256_cmpeq_epi16(result, zeros);

        // cmp holds 16-bit results, movemask yields two bits for each of
        // them; keep one bit per result
        uint32_t mask = _mm256_movemask_epi8(cmp) & 0x55555555u;

        while (mask != 0) {

            const auto bitpos = bits::get_first_bit_set(mask)/2;

            if (memcmp(s + i + bitpos + 4, neddle + 4, neddle_size - 4) == 0) {
                return i + bitpos;
            }

            mask = bits::clear_leftmost_set(mask);
        }
    }

    return std::string::npos;
}
// ------------------------------------------------------------------------
// Search for a needle of exactly 4 bytes using MPSADBW: a zero sum of
// absolute differences between the needle and a 4-byte window of the input
// means the window equals the needle, so no further verification is needed.
//
// Sixteen start positions are examined per iteration. Returns the lowest
// matching offset of the first block with a match, or std::string::npos.
// Every iteration loads 32 bytes from s + i, so the function reads past
// s + n and the reported offset may exceed n - 4.
size_t avx2_strstr_len4(const char* s, size_t n, const char* neddle) {

    assert(n > 0);

    // memcpy instead of dereferencing a casted pointer: `neddle` need not be
    // 4-byte aligned and the cast breaks the strict aliasing rule
    uint32_t prefix32;
    memcpy(&prefix32, neddle, sizeof(prefix32));

    const __m256i prefix = _mm256_set1_epi32(prefix32);
    const __m256i zeros  = _mm256_setzero_si256();

    // see avx2_strstr_long for the layout produced by this permutation
    const __m256i permute = _mm256_setr_epi32(
        0, 1, 2, 0,
        2, 3, 4, 0
    );

    for (size_t i = 0; i < n; i += 16) {

        const __m256i in     = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s + i));
        const __m256i data   = _mm256_permutevar8x32_epi32(in, permute);
        const __m256i result = _mm256_mpsadbw_epu8(data, prefix, 0);

        const __m256i cmp = _mm256_cmpeq_epi16(result, zeros);

        // cmp holds 16-bit results, movemask yields two bits for each of
        // them; keep one bit per result
        const uint32_t mask = _mm256_movemask_epi8(cmp) & 0x55555555u;

        if (mask != 0) {
            return i + bits::get_first_bit_set(mask)/2;
        }
    }

    return std::string::npos;
}
// ------------------------------------------------------------------------
// Finds the first occurrence of `neddle` (neddle_size bytes) in `s` (n bytes).
//
// Returns the offset of the match or std::string::npos. An empty needle
// matches at offset 0.
//
// Needles of 1..3 bytes are handled with memchr/memcmp, which honour the
// given lengths; the previously used strchr/strstr required both buffers to
// be NUL-terminated and stopped at the first embedded NUL.
size_t avx2_strstr(const char* s, size_t n, const char* neddle, size_t neddle_size) {

    size_t result = std::string::npos;

    if (n < neddle_size) {
        return result;
    }

    switch (neddle_size) {
        case 0:
            return 0;

        case 1: {
            const char* res = static_cast<const char*>(memchr(s, neddle[0], n));

            return (res != nullptr) ? static_cast<size_t>(res - s) : std::string::npos;
        }

        case 2:
        case 3: {
            // last offset at which the needle still fits into the haystack
            const char* const last = s + (n - neddle_size);

            for (const char* p = s; p <= last; ++p) {
                // jump to the next occurrence of the first needle byte
                p = static_cast<const char*>(memchr(p, neddle[0], last - p + 1));
                if (p == nullptr) {
                    break;
                }

                if (memcmp(p + 1, neddle + 1, neddle_size - 1) == 0) {
                    return static_cast<size_t>(p - s);
                }
            }

            return std::string::npos;
        }

        case 4:
            result = avx2_strstr_len4(s, n, neddle);
            break;

        default:
            result = avx2_strstr_long(s, n, neddle, neddle_size);
            break;
    }

    // the vector procedures scan whole blocks and may report a position
    // where the needle would stick out past the end of the haystack
    if (result <= n - neddle_size) {
        return result;
    } else {
        return std::string::npos;
    }
}
// --------------------------------------------------
// Convenience overload forwarding the buffers and sizes of both strings to
// the pointer-based variant.
size_t avx2_strstr(const std::string& s, const std::string& neddle) {

    const char* text    = s.data();
    const char* pattern = neddle.data();

    return avx2_strstr(text, s.size(), pattern, neddle.size());
}
================================================
FILE: avx512bw-strstr-v2.cpp
================================================
// implements scheme described in http://0x80.pl/articles/simd-friendly-karp-rabin.html
size_t avx512bw_strstr_v2_anysize(const char* string, size_t n, const char* needle, size_t k) {
assert(n > 0);
assert(k > 0);
const __m512i first = _mm512_set1_epi8(needle[0]);
const __m512i last = _mm512_set1_epi8(needle[k - 1]);
char* haystack = const_cast<char*>(string);
char* end = haystack + n;
for (/**/; haystack < end; haystack += 64) {
const __m512i block_first = _mm512_loadu_si512(haystack + 0);
const __m512i block_last = _mm512_loadu_si512(haystack + k - 1);
uint64_t mask = _mm512_cmpeq_epi8_mask(block_first, first)
& _mm512_cmpeq_epi8_mask(block_last, last);
while (mask != 0) {
const uint64_t bitpos = bits::get_first_bit_set(mask);
const char* s = reinterpret_cast<const char*>(haystack);
if (memcmp(s + bitpos + 1, needle + 1, k - 2) == 0) {
return (s - string) + bitpos;
}
mask = bits::clear_leftmost_set(mask);
}
}
return size_t(-1);
}
template <size_t k, typename MEMCMP>
size_t avx512bw_strstr_v2_memcmp(const char* string, size_t n, const char* needle, MEMCMP memeq_fun) {
assert(n > 0);
assert(k > 0);
const __m512i first = _mm512_set1_epi8(needle[0]);
const __m512i last = _mm512_set1_epi8(needle[k - 1]);
char* haystack = const_cast<char*>(string);
char* end = haystack + n;
for (/**/; haystack < end; haystack += 64) {
const __m512i block_first = _mm512_loadu_si512(haystack + 0);
const __m512i block_last = _mm512_loadu_si512(haystack + k - 1);
uint64_t mask = _mm512_cmpeq_epi8_mask(block_first, first)
& _mm512_cmpeq_epi8_mask(block_last, last);
while (mask != 0) {
const uint64_t bitpos = bits::get_first_bit_set(mask);
const char* s = reinterpret_cast<const char*>(haystack);
if (memeq_fun(s + bitpos + 1, needle + 1)) {
return (s - string) + bitpos;
}
mask = bits::clear_leftmost_set(mask);
}
}
return size_t(-1);
}
// ------------------------------------------------------------------------
/*
    Finds the first occurrence of needle[0 .. k) in s[0 .. n).

    Returns the byte offset of the first match, 0 for an empty needle,
    or std::string::npos when the needle does not occur.

    Needles of 2..12 bytes are dispatched to kernels specialised on the
    length; the comparator passed to each kernel checks only the k-2 inner
    bytes, because the first and last byte are matched by the kernel itself.
*/
size_t avx512bw_strstr_v2(const char* s, size_t n, const char* needle, size_t k) {

    size_t result = std::string::npos;

    if (n < k) {
        return result;
    }

    switch (k) {
        case 0:
            return 0;

        case 1: {
            // memchr respects the explicit length n; strchr would depend on a
            // NUL terminator and could report a position outside s[0 .. n)
            const char* res = static_cast<const char*>(memchr(s, needle[0], n));

            return (res != nullptr) ? static_cast<size_t>(res - s) : std::string::npos;
        }

        case 2:
            result = avx512bw_strstr_v2_memcmp<2>(s, n, needle, always_true);
            break;

        case 3:
            result = avx512bw_strstr_v2_memcmp<3>(s, n, needle, memcmp1);
            break;

        case 4:
            result = avx512bw_strstr_v2_memcmp<4>(s, n, needle, memcmp2);
            break;

        case 5:
            result = avx512bw_strstr_v2_memcmp<5>(s, n, needle, memcmp3);
            break;

        case 6:
            result = avx512bw_strstr_v2_memcmp<6>(s, n, needle, memcmp4);
            break;

        case 7:
            result = avx512bw_strstr_v2_memcmp<7>(s, n, needle, memcmp5);
            break;

        case 8:
            result = avx512bw_strstr_v2_memcmp<8>(s, n, needle, memcmp6);
            break;

        case 9:
            result = avx512bw_strstr_v2_memcmp<9>(s, n, needle, memcmp7);
            break;

        case 10:
            result = avx512bw_strstr_v2_memcmp<10>(s, n, needle, memcmp8);
            break;

        case 11:
            result = avx512bw_strstr_v2_memcmp<11>(s, n, needle, memcmp9);
            break;

        case 12:
            result = avx512bw_strstr_v2_memcmp<12>(s, n, needle, memcmp10);
            break;

        default:
            result = avx512bw_strstr_v2_anysize(s, n, needle, k);
            break;
    }

    // kernels may report candidates that start beyond the last valid
    // position (they load whole 64-byte blocks); reject those
    if (result <= n - k) {
        return result;
    } else {
        return std::string::npos;
    }
}
// --------------------------------------------------
// Convenience overload: forwards the raw buffers and sizes of both strings
// to the pointer/length variant of avx512bw_strstr_v2.
size_t avx512bw_strstr_v2(const std::string& s, const std::string& needle) {

    const char* const text    = s.data();
    const char* const pattern = needle.data();

    return avx512bw_strstr_v2(text, s.size(), pattern, needle.size());
}
================================================
FILE: avx512bw-strstr-v3.cpp
================================================
// implements scheme described in http://0x80.pl/articles/simd-friendly-karp-rabin.html
size_t avx512bw_strstr_v3_anysize(const char* string, size_t n, const char* needle, size_t k) {
assert(n > 0);
assert(k > 0);
const __m512i first = _mm512_set1_epi8(needle[0]);
const __m512i last = _mm512_set1_epi8(needle[k - 1]);
char* haystack = const_cast<char*>(string);
char* end = haystack + n;
for (/**/; haystack < end; haystack += 64) {
const __m512i block_first = _mm512_loadu_si512(haystack + 0);
const __mmask64 first_eq = _mm512_cmpeq_epi8_mask(block_first, first);
if (first_eq == 0)
continue;
const __m512i block_last = _mm512_loadu_si512(haystack + k - 1);
uint64_t mask = _mm512_mask_cmpeq_epi8_mask(first_eq, block_last, last);
while (mask != 0) {
const uint64_t bitpos = bits::get_first_bit_set(mask);
const char* s = reinterpret_cast<const char*>(haystack);
if (memcmp(s + bitpos + 1, needle + 1, k - 2) == 0) {
return (s - string) + bitpos;
}
mask = bits::clear_leftmost_set(mask);
}
}
return size_t(-1);
}
template <size_t k, typename MEMCMP>
size_t avx512bw_strstr_v3_memcmp(const char* string, size_t n, const char* needle, MEMCMP memeq_fun) {
assert(n > 0);
assert(k > 0);
const __m512i first = _mm512_set1_epi8(needle[0]);
const __m512i last = _mm512_set1_epi8(needle[k - 1]);
char* haystack = const_cast<char*>(string);
char* end = haystack + n;
for (/**/; haystack < end; haystack += 64) {
const __m512i block_first = _mm512_loadu_si512(haystack + 0);
const __mmask64 first_eq = _mm512_cmpeq_epi8_mask(block_first, first);
if (first_eq == 0)
continue;
const __m512i block_last = _mm512_loadu_si512(haystack + k - 1);
uint64_t mask = _mm512_mask_cmpeq_epi8_mask(first_eq, block_last, last);
while (mask != 0) {
const uint64_t bitpos = bits::get_first_bit_set(mask);
const char* s = reinterpret_cast<const char*>(haystack);
if (memeq_fun(s + bitpos + 1, needle + 1)) {
return (s - string) + bitpos;
}
mask = bits::clear_leftmost_set(mask);
}
}
return size_t(-1);
}
// ------------------------------------------------------------------------
/*
    Finds the first occurrence of needle[0 .. k) in s[0 .. n).

    Returns the byte offset of the first match, 0 for an empty needle,
    or std::string::npos when the needle does not occur.

    Needles of 2..12 bytes are dispatched to kernels specialised on the
    length; the comparator passed to each kernel checks only the k-2 inner
    bytes, because the first and last byte are matched by the kernel itself.
*/
size_t avx512bw_strstr_v3(const char* s, size_t n, const char* needle, size_t k) {

    size_t result = std::string::npos;

    if (n < k) {
        return result;
    }

    switch (k) {
        case 0:
            return 0;

        case 1: {
            // memchr respects the explicit length n; strchr would depend on a
            // NUL terminator and could report a position outside s[0 .. n)
            const char* res = static_cast<const char*>(memchr(s, needle[0], n));

            return (res != nullptr) ? static_cast<size_t>(res - s) : std::string::npos;
        }

        case 2:
            result = avx512bw_strstr_v3_memcmp<2>(s, n, needle, always_true);
            break;

        case 3:
            result = avx512bw_strstr_v3_memcmp<3>(s, n, needle, memcmp1);
            break;

        case 4:
            result = avx512bw_strstr_v3_memcmp<4>(s, n, needle, memcmp2);
            break;

        case 5:
            result = avx512bw_strstr_v3_memcmp<5>(s, n, needle, memcmp3);
            break;

        case 6:
            result = avx512bw_strstr_v3_memcmp<6>(s, n, needle, memcmp4);
            break;

        case 7:
            result = avx512bw_strstr_v3_memcmp<7>(s, n, needle, memcmp5);
            break;

        case 8:
            result = avx512bw_strstr_v3_memcmp<8>(s, n, needle, memcmp6);
            break;

        case 9:
            result = avx512bw_strstr_v3_memcmp<9>(s, n, needle, memcmp7);
            break;

        case 10:
            result = avx512bw_strstr_v3_memcmp<10>(s, n, needle, memcmp8);
            break;

        case 11:
            result = avx512bw_strstr_v3_memcmp<11>(s, n, needle, memcmp9);
            break;

        case 12:
            result = avx512bw_strstr_v3_memcmp<12>(s, n, needle, memcmp10);
            break;

        default:
            result = avx512bw_strstr_v3_anysize(s, n, needle, k);
            break;
    }

    // kernels may report candidates that start beyond the last valid
    // position (they load whole 64-byte blocks); reject those
    if (result <= n - k) {
        return result;
    } else {
        return std::string::npos;
    }
}
// --------------------------------------------------
// Convenience overload: forwards the raw buffers and sizes of both strings
// to the pointer/length variant of avx512bw_strstr_v3.
size_t avx512bw_strstr_v3(const std::string& s, const std::string& needle) {

    const char* const text    = s.data();
    const char* const pattern = needle.data();

    return avx512bw_strstr_v3(text, s.size(), pattern, needle.size());
}
================================================
FILE: avx512f-strstr-v2.cpp
================================================
// implements scheme described in http://0x80.pl/articles/simd-friendly-karp-rabin.html
// Returns a mask with bit i set when the i-th 32-bit lane of `v` contains
// at least one zero byte. Uses the classic SWAR test
//
//      (x - 0x01010101) & ~x & 0x80808080
//
// evaluated per 32-bit lane; the and/and-not part is a single ternary-logic
// instruction (truth table 0x20 == a & ~b & c).
__mmask16 FORCE_INLINE zero_byte_mask(const __m512i v) {

    const __m512i ones      = _mm512_set1_epi8(0x01);
    const __m512i high_bits = _mm512_set1_epi8(int8_t(0x80));

    const __m512i decremented = _mm512_sub_epi32(v, ones);
    const __m512i flagged     = _mm512_ternarylogic_epi32(decremented, v, high_bits, 0x20);

    return _mm512_test_epi32_mask(flagged, flagged);
}
size_t avx512f_strstr_v2_anysize(const char* string, size_t n, const char* needle, size_t k) {
assert(n > 0);
assert(k > 0);
const __m512i first = _mm512_set1_epi8(needle[0]);
const __m512i last = _mm512_set1_epi8(needle[k - 1]);
char* haystack = const_cast<char*>(string);
char* end = haystack + n;
for (/**/; haystack < end; haystack += 64) {
const __m512i block_first = _mm512_loadu_si512(haystack + 0);
const __m512i block_last = _mm512_loadu_si512(haystack + k - 1);
#if 0
const __m512i first_zeros = _mm512_xor_si512(block_first, first);
const __m512i last_zeros = _mm512_xor_si512(block_last, last);
const __m512i zeros = _mm512_or_si512(first_zeros, last_zeros);
#else
const __m512i first_zeros = _mm512_xor_si512(block_first, first);
/*
first_zeros | block_last | last | first_zeros | (block_last ^ last)
------------+------------+------+------------------------------------
0 | 0 | 0 | 0
0 | 0 | 1 | 1
0 | 1 | 0 | 1
0 | 1 | 1 | 0
1 | 0 | 0 | 1
1 | 0 | 1 | 1
1 | 1 | 0 | 1
1 | 1 | 1 | 1
*/
const __m512i zeros = _mm512_ternarylogic_epi32(first_zeros, block_last, last, 0xf6);
#endif
uint32_t mask = zero_byte_mask(zeros);
while (mask) {
const uint64_t p = __builtin_ctz(mask);
if (memcmp(haystack + 4*p + 0, needle, k) == 0) {
return (haystack - string) + 4*p + 0;
}
if (memcmp(haystack + 4*p + 1, needle, k) == 0) {
return (haystack - string) + 4*p + 1;
}
if (memcmp(haystack + 4*p + 2, needle, k) == 0) {
return (haystack - string) + 4*p + 2;
}
if (memcmp(haystack + 4*p + 3, needle, k) == 0) {
return (haystack - string) + 4*p + 3;
}
mask = bits::clear_leftmost_set(mask);
}
}
return size_t(-1);
}
template <size_t k, typename MEMCMP>
size_t avx512f_strstr_v2_memcmp(const char* string, size_t n, const char* needle, MEMCMP memeq_fun) {
assert(n > 0);
assert(k > 0);
const __m512i first = _mm512_set1_epi8(needle[0]);
const __m512i last = _mm512_set1_epi8(needle[k - 1]);
char* haystack = const_cast<char*>(string);
char* end = haystack + n;
for (/**/; haystack < end; haystack += 64) {
const __m512i block_first = _mm512_loadu_si512(haystack + 0);
const __m512i block_last = _mm512_loadu_si512(haystack + k - 1);
const __m512i first_zeros = _mm512_xor_si512(block_first, first);
const __m512i zeros = _mm512_ternarylogic_epi32(first_zeros, block_last, last, 0xf6);
uint32_t mask = zero_byte_mask(zeros);
while (mask) {
const uint64_t p = __builtin_ctz(mask);
if (memeq_fun(haystack + 4*p + 0, needle)) {
return (haystack - string) + 4*p + 0;
}
if (memeq_fun(haystack + 4*p + 1, needle)) {
return (haystack - string) + 4*p + 1;
}
if (memeq_fun(haystack + 4*p + 2, needle)) {
return (haystack - string) + 4*p + 2;
}
if (memeq_fun(haystack + 4*p + 3, needle)) {
return (haystack - string) + 4*p + 3;
}
mask = bits::clear_leftmost_set(mask);
}
}
return size_t(-1);
}
// ------------------------------------------------------------------------
/*
    Finds the first occurrence of needle[0 .. k) in s[0 .. n).

    Returns the byte offset of the first match, 0 for an empty needle,
    or std::string::npos when the needle does not occur.

    Needles of 2..12 bytes are dispatched to kernels specialised on the
    length. Unlike the byte-granular variants, these kernels verify the
    whole needle, hence memcmpK is paired with needle length K.
*/
size_t avx512f_strstr_v2(const char* s, size_t n, const char* needle, size_t k) {

    size_t result = std::string::npos;

    if (n < k) {
        return result;
    }

    switch (k) {
        case 0:
            return 0;

        case 1: {
            // memchr respects the explicit length n; strchr would depend on a
            // NUL terminator and could report a position outside s[0 .. n)
            const char* res = static_cast<const char*>(memchr(s, needle[0], n));

            return (res != nullptr) ? static_cast<size_t>(res - s) : std::string::npos;
        }

        case 2:
            result = avx512f_strstr_v2_memcmp<2>(s, n, needle, memcmp2);
            break;

        case 3:
            result = avx512f_strstr_v2_memcmp<3>(s, n, needle, memcmp3);
            break;

        case 4:
            result = avx512f_strstr_v2_memcmp<4>(s, n, needle, memcmp4);
            break;

        case 5:
            result = avx512f_strstr_v2_memcmp<5>(s, n, needle, memcmp5);
            break;

        case 6:
            result = avx512f_strstr_v2_memcmp<6>(s, n, needle, memcmp6);
            break;

        case 7:
            result = avx512f_strstr_v2_memcmp<7>(s, n, needle, memcmp7);
            break;

        case 8:
            result = avx512f_strstr_v2_memcmp<8>(s, n, needle, memcmp8);
            break;

        case 9:
            result = avx512f_strstr_v2_memcmp<9>(s, n, needle, memcmp9);
            break;

        case 10:
            result = avx512f_strstr_v2_memcmp<10>(s, n, needle, memcmp10);
            break;

        case 11:
            result = avx512f_strstr_v2_memcmp<11>(s, n, needle, memcmp11);
            break;

        case 12:
            result = avx512f_strstr_v2_memcmp<12>(s, n, needle, memcmp12);
            break;

        default:
            result = avx512f_strstr_v2_anysize(s, n, needle, k);
            break;
    }

    // kernels may report candidates that start beyond the last valid
    // position (they load whole 64-byte blocks); reject those
    if (result <= n - k) {
        return result;
    } else {
        return std::string::npos;
    }
}
// --------------------------------------------------
// Convenience overload: forwards the raw buffers and sizes of both strings
// to the pointer/length variant of avx512f_strstr_v2.
size_t avx512f_strstr_v2(const std::string& s, const std::string& needle) {

    const char* const text    = s.data();
    const char* const pattern = needle.data();

    return avx512f_strstr_v2(text, s.size(), pattern, needle.size());
}
================================================
FILE: avx512f-strstr.cpp
================================================
/*
string - pointer to the string
n - string length in bytes
needle - pointer to another string
k - needle length in bytes
*/
// Searches `string` (n bytes) for `needle` (k bytes, k > 4).
//
// The 4-byte needle prefix is compared against every byte position of a
// 64-byte block: v0..v3 hold the block as seen from byte offsets 0..3, so
// one 32-bit compare per vector covers all 64 positions. Candidate
// positions are then verified with memcmp over the remaining k-4 bytes.
//
// Returns the offset of the lowest verified position in the first block
// that has one, or size_t(-1). Every iteration loads the following block
// as well, so reads reach past string + n; the caller is expected to
// discard results greater than n - k.
size_t avx512f_strstr_long(const char* string, size_t n, const char* needle, size_t k) {

    assert(n > 0);
    assert(k > 4);

    __m512i curr;
    __m512i next;
    __m512i v0, v1, v2, v3;

    char* haystack = const_cast<char*>(string);
    char* last     = haystack + n;

    // the first 4 bytes of needle; memcpy is the well-defined way to read
    // them, as needle has no particular alignment and is not a uint32_t object
    uint32_t prf;
    memcpy(&prf, needle, sizeof(prf));

    const __m512i prefix = _mm512_set1_epi32(prf);

    next = _mm512_loadu_si512(haystack);

    for (/**/; haystack < last; haystack += 64) {

        curr = next;
        next = _mm512_loadu_si512(haystack + 64);

        // shft: curr moved by one 32-bit lane, refilled from next
        const __m512i shft = _mm512_alignr_epi32(next, curr, 1);

        v0 = curr;

        {
            const __m512i t1 = _mm512_srli_epi32(curr, 8);
            const __m512i t2 = _mm512_slli_epi32(shft, 24);
            v1 = _mm512_or_si512(t1, t2);
        }
        {
            const __m512i t1 = _mm512_srli_epi32(curr, 16);
            const __m512i t2 = _mm512_slli_epi32(shft, 16);
            v2 = _mm512_or_si512(t1, t2);
        }
        {
            const __m512i t1 = _mm512_srli_epi32(curr, 24);
            const __m512i t2 = _mm512_slli_epi32(shft, 8);
            v3 = _mm512_or_si512(t1, t2);
        }

        uint16_t m0 = _mm512_cmpeq_epi32_mask(v0, prefix);
        uint16_t m1 = _mm512_cmpeq_epi32_mask(v1, prefix);
        uint16_t m2 = _mm512_cmpeq_epi32_mask(v2, prefix);
        uint16_t m3 = _mm512_cmpeq_epi32_mask(v3, prefix);

        // lowest verified position within this block (64 == none yet)
        int index = 64;

        // Pops the lowest candidate of `lane_mask` (lanes seen from byte offset
        // `byte_offset`) and records it when it lies before the best position
        // found so far and the rest of the needle matches.
        const auto probe = [&](uint16_t& lane_mask, int byte_offset) {
            if (lane_mask == 0) {
                return;
            }

            const int pos = __builtin_ctz(lane_mask) * 4 + byte_offset;
            lane_mask = lane_mask & (lane_mask - 1);

            if (pos < index && memcmp(haystack + pos + 4, needle + 4, k - 4) == 0) {
                index = pos;
            }
        };

        while (m0 | m1 | m2 | m3) {
            probe(m0, 0);
            probe(m1, 1);
            probe(m2, 2);
            probe(m3, 3);
        }

        if (index < 64) {
            return (haystack - string) + index;
        }
    }

    return size_t(-1);
}
// ------------------------------------------------------------------------
// Searches `string` (n bytes) for a needle of exactly 4 bytes.
//
// The needle is compared as one 32-bit word against every byte position of
// a 64-byte block: v0..v3 hold the block as seen from byte offsets 0..3, so
// a match in any compare mask is already a full match and no verification
// is needed.
//
// Returns the offset of the lowest match in the first block that has one,
// or size_t(-1). Every iteration loads the following block as well, so
// reads reach past string + n; the caller is expected to discard results
// greater than n - 4.
size_t avx512f_strstr_eq4(const char* string, size_t n, const char* needle) {

    assert(n > 0);

    __m512i curr;
    __m512i next;
    __m512i v0, v1, v2, v3;

    char* haystack = const_cast<char*>(string);
    char* last     = haystack + n;

    // the 4 bytes of needle; memcpy is the well-defined way to read them,
    // as needle has no particular alignment and is not a uint32_t object
    uint32_t prf;
    memcpy(&prf, needle, sizeof(prf));

    const __m512i prefix = _mm512_set1_epi32(prf);

    next = _mm512_loadu_si512(haystack);

    for (/**/; haystack < last; haystack += 64) {

        curr = next;
        next = _mm512_loadu_si512(haystack + 64);

        // shft: curr moved by one 32-bit lane, refilled from next
        const __m512i shft = _mm512_alignr_epi32(next, curr, 1);

        v0 = curr;

        {
            const __m512i t1 = _mm512_srli_epi32(curr, 8);
            const __m512i t2 = _mm512_slli_epi32(shft, 24);
            v1 = _mm512_or_si512(t1, t2);
        }
        {
            const __m512i t1 = _mm512_srli_epi32(curr, 16);
            const __m512i t2 = _mm512_slli_epi32(shft, 16);
            v2 = _mm512_or_si512(t1, t2);
        }
        {
            const __m512i t1 = _mm512_srli_epi32(curr, 24);
            const __m512i t2 = _mm512_slli_epi32(shft, 8);
            v3 = _mm512_or_si512(t1, t2);
        }

        uint16_t m0 = _mm512_cmpeq_epi32_mask(v0, prefix);
        uint16_t m1 = _mm512_cmpeq_epi32_mask(v1, prefix);
        uint16_t m2 = _mm512_cmpeq_epi32_mask(v2, prefix);
        uint16_t m3 = _mm512_cmpeq_epi32_mask(v3, prefix);

        // lowest matching position within this block (64 == none)
        int index = 64;

        // Folds the lowest match of one compare mask into `index`.
        const auto consider = [&index](uint16_t lane_mask, int byte_offset) {
            if (lane_mask) {
                const int pos = __builtin_ctz(lane_mask) * 4 + byte_offset;
                if (pos < index) {
                    index = pos;
                }
            }
        };

        consider(m0, 0);
        consider(m1, 1);
        consider(m2, 2);
        consider(m3, 3);

        if (index < 64) {
            return (haystack - string) + index;
        }

        assert(m0 == 0 && m1 == 0 && m2 == 0 && m3 == 0);
    }

    return size_t(-1);
}
// ------------------------------------------------------------------------
/*
    Finds the first occurrence of needle[0 .. needle_size) in s[0 .. n).

    Returns the byte offset of the first match, 0 for an empty needle,
    or std::string::npos when the needle does not occur.

    Both buffers are treated as length-delimited: neither has to be
    NUL-terminated and both may contain embedded NUL bytes.
*/
size_t avx512f_strstr(const char* s, size_t n, const char* needle, size_t needle_size) {

    size_t result = std::string::npos;

    if (n < needle_size) {
        return result;
    }

    switch (needle_size) {
        case 0:
            return 0;

        case 1: {
            // memchr honours the explicit length; strchr would scan up to the
            // first NUL, i.e. possibly past s + n or not far enough
            const char* res = static_cast<const char*>(memchr(s, needle[0], n));

            return (res != nullptr) ? static_cast<size_t>(res - s) : std::string::npos;
        }

        case 2:
        case 3: {
            // Bounded scan: locate the first byte with memchr, then verify the
            // remaining one or two bytes. `last` is the final valid start position.
            const char* const last = s + (n - needle_size);
            const char* p = s;

            while (p <= last) {
                p = static_cast<const char*>(memchr(p, needle[0], static_cast<size_t>(last - p) + 1));
                if (p == nullptr) {
                    break;
                }

                if (memcmp(p + 1, needle + 1, needle_size - 1) == 0) {
                    return static_cast<size_t>(p - s);
                }

                ++p;
            }

            return std::string::npos;
        }

        case 4:
            result = avx512f_strstr_eq4(s, n, needle);
            break;

        default:
            result = avx512f_strstr_long(s, n, needle, needle_size);
            break;
    }

    // The vectorised kernels may report a position whose match would extend
    // past the end of the text; such results are rejected here.
    if (result <= n - needle_size) {
        return result;
    } else {
        return std::string::npos;
    }
}
// --------------------------------------------------
// Convenience overload: forwards the raw buffers and sizes of both strings
// to the pointer/length variant of avx512f_strstr.
size_t avx512f_strstr(const std::string& s, const std::string& needle) {

    const char* const text    = s.data();
    const char* const pattern = needle.data();

    return avx512f_strstr(text, s.size(), pattern, needle.size());
}
================================================
FILE: common.h
================================================
#pragma once
// Function qualifier: inline plus the GCC/Clang always_inline attribute.
#define FORCE_INLINE inline __attribute__((always_inline))
// Function qualifier: inline plus the GCC/Clang unused attribute, for helpers
// that are not referenced by every translation unit including them.
#define MAYBE_UNUSED inline __attribute__((unused))
// Select the intrinsics header: NEON when HAVE_NEON_INSTRUCTIONS is defined
// (which also switches fixed-memcmp.cpp to its simpler expressions),
// the x86 header otherwise.
#if defined(HAVE_NEON_INSTRUCTIONS)
# include <arm_neon.h>
# define USE_SIMPLE_MEMCMP // for fixed-memcmp.cpp
#else
# include <immintrin.h>
#endif
================================================
FILE: data/placeholder
================================================
placeholder
================================================
FILE: fixed-memcmp.cpp
================================================
// #define USE_SIMPLE_MEMCMP // when defined simpler expressions are used
// Fixed-length equality predicates: memcmpN(a, b) returns true when the
// first N bytes of a and b are equal. They are meant to be passed to the
// search kernels as compile-time selected comparators.
//
// Unless USE_SIMPLE_MEMCMP is defined, memcmp3/5/6/7/11 load a full 4- or
// 8-byte word and mask off the surplus bytes, so they READ up to 3 bytes
// past the compared range (the masks assume little-endian byte order).
namespace {

    // Loads a T from an arbitrarily aligned address. memcpy is the
    // well-defined way to do this (a reinterpret_cast'ed dereference
    // violates alignment and aliasing rules); compilers turn it into
    // a single load.
    template <typename T>
    inline T fixed_memcmp_load(const char* p) {
        T value;
        memcpy(&value, p, sizeof(T));
        return value;
    }

    // Comparator for "nothing left to compare": always reports equality.
    MAYBE_UNUSED
    bool always_true(const char*, const char*) {
        return true;
    }

    MAYBE_UNUSED
    bool memcmp1(const char* a, const char* b) {
        return a[0] == b[0];
    }

    MAYBE_UNUSED
    bool memcmp2(const char* a, const char* b) {
        const uint16_t A = fixed_memcmp_load<uint16_t>(a);
        const uint16_t B = fixed_memcmp_load<uint16_t>(b);
        return A == B;
    }

    MAYBE_UNUSED
    bool memcmp3(const char* a, const char* b) {
#ifdef USE_SIMPLE_MEMCMP
        return memcmp2(a, b) && memcmp1(a + 2, b + 2);
#else
        // reads 4 bytes, compares the low 3
        const uint32_t A = fixed_memcmp_load<uint32_t>(a);
        const uint32_t B = fixed_memcmp_load<uint32_t>(b);
        return (A & 0x00ffffff) == (B & 0x00ffffff);
#endif
    }

    MAYBE_UNUSED
    bool memcmp4(const char* a, const char* b) {
        const uint32_t A = fixed_memcmp_load<uint32_t>(a);
        const uint32_t B = fixed_memcmp_load<uint32_t>(b);
        return A == B;
    }

    MAYBE_UNUSED
    bool memcmp5(const char* a, const char* b) {
#ifdef USE_SIMPLE_MEMCMP
        return memcmp4(a, b) && memcmp1(a + 4, b + 4);
#else
        // reads 8 bytes, compares the low 5
        const uint64_t A = fixed_memcmp_load<uint64_t>(a);
        const uint64_t B = fixed_memcmp_load<uint64_t>(b);
        return ((A ^ B) & 0x000000fffffffffflu) == 0;
#endif
    }

    MAYBE_UNUSED
    bool memcmp6(const char* a, const char* b) {
#ifdef USE_SIMPLE_MEMCMP
        return memcmp4(a, b) && memcmp2(a + 4, b + 4);
#else
        // reads 8 bytes, compares the low 6
        const uint64_t A = fixed_memcmp_load<uint64_t>(a);
        const uint64_t B = fixed_memcmp_load<uint64_t>(b);
        return ((A ^ B) & 0x0000fffffffffffflu) == 0;
#endif
    }

    MAYBE_UNUSED
    bool memcmp7(const char* a, const char* b) {
#ifdef USE_SIMPLE_MEMCMP
        return memcmp4(a, b) && memcmp3(a + 4, b + 4);
#else
        // reads 8 bytes, compares the low 7
        const uint64_t A = fixed_memcmp_load<uint64_t>(a);
        const uint64_t B = fixed_memcmp_load<uint64_t>(b);
        return ((A ^ B) & 0x00fffffffffffffflu) == 0;
#endif
    }

    MAYBE_UNUSED
    bool memcmp8(const char* a, const char* b) {
        const uint64_t A = fixed_memcmp_load<uint64_t>(a);
        const uint64_t B = fixed_memcmp_load<uint64_t>(b);
        return A == B;
    }

    MAYBE_UNUSED
    bool memcmp9(const char* a, const char* b) {
        const uint64_t A = fixed_memcmp_load<uint64_t>(a);
        const uint64_t B = fixed_memcmp_load<uint64_t>(b);
        // non-short-circuit & on purpose: both halves are always evaluated
        return (A == B) & (a[8] == b[8]);
    }

    MAYBE_UNUSED
    bool memcmp10(const char* a, const char* b) {
        const uint64_t Aq = fixed_memcmp_load<uint64_t>(a);
        const uint64_t Bq = fixed_memcmp_load<uint64_t>(b);
        const uint16_t Aw = fixed_memcmp_load<uint16_t>(a + 8);
        const uint16_t Bw = fixed_memcmp_load<uint16_t>(b + 8);
        return (Aq == Bq) & (Aw == Bw);
    }

    MAYBE_UNUSED
    bool memcmp11(const char* a, const char* b) {
#ifdef USE_SIMPLE_MEMCMP
        return memcmp8(a, b) && memcmp3(a + 8, b + 8);
#else
        // reads 12 bytes, compares the low 11
        const uint64_t Aq = fixed_memcmp_load<uint64_t>(a);
        const uint64_t Bq = fixed_memcmp_load<uint64_t>(b);
        const uint32_t Ad = fixed_memcmp_load<uint32_t>(a + 8);
        const uint32_t Bd = fixed_memcmp_load<uint32_t>(b + 8);
        return (Aq == Bq) & ((Ad & 0x00ffffff) == (Bd & 0x00ffffff));
#endif
    }

    MAYBE_UNUSED
    bool memcmp12(const char* a, const char* b) {
        const uint64_t Aq = fixed_memcmp_load<uint64_t>(a);
        const uint64_t Bq = fixed_memcmp_load<uint64_t>(b);
        const uint32_t Ad = fixed_memcmp_load<uint32_t>(a + 8);
        const uint32_t Bd = fixed_memcmp_load<uint32_t>(b + 8);
        return (Aq == Bq) & (Ad == Bd);
    }

}
================================================
FILE: make_words.sh
================================================
# Split a text file into its unique words, one per line.
#
# usage: make_words.sh INPUT OUTPUT
#
# Every run of characters other than ASCII letters is squeezed into a single
# newline, then the resulting lines are sorted with duplicates removed.
# Both paths are quoted so that file names containing spaces work.
tr -s -c "a-zA-Z" "\n" < "$1" \
    | sort -u \
    > "$2"
================================================
FILE: neon-strstr-v2.cpp
================================================
// Scans `s` (n bytes) in 16-byte steps for `needle` (k bytes).
//
// Each step compares 16 text bytes against the first needle byte and the
// 16 bytes starting k-1 further against the last needle byte; positions
// where both agree are verified with memcmp over the k-2 inner bytes
// (so k must be at least 2; the caller in this file only passes k > 12).
//
// Returns the offset of the first verified position or std::string::npos.
// The loads may reach past s + n, so a returned offset can be greater
// than n - k; the caller is expected to reject such results.
size_t FORCE_INLINE neon_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) {
assert(k > 0);
assert(n > 0);
const uint8x16_t first = vdupq_n_u8(needle[0]);
const uint8x16_t last = vdupq_n_u8(needle[k - 1]);
// selector for vbsl: low nibble taken from the first operand, high nibble from the second
const uint8x8_t half = vdup_n_u8(0x0f);
const uint8_t* ptr = reinterpret_cast<const uint8_t*>(s);
// the 8 predicate bytes are stored through `tmp` and inspected as two 32-bit words
union {
uint8_t tmp[8];
uint32_t word[2];
};
for (size_t i = 0; i < n; i += 16) {
const uint8x16_t block_first = vld1q_u8(ptr + i);
const uint8x16_t block_last = vld1q_u8(ptr + i + k - 1);
const uint8x16_t eq_first = vceqq_u8(first, block_first);
const uint8x16_t eq_last = vceqq_u8(last, block_last);
const uint8x16_t pred_16 = vandq_u8(eq_first, eq_last);
// pack 16 predicate bytes into 8: byte j keeps position j in its low
// nibble and position j + 8 in its high nibble
const uint8x8_t pred_8 = vbsl_u8(half, vget_low_u8(pred_16), vget_high_u8(pred_16));
vst1_u8(tmp, pred_8);
// no candidate in this block
if ((word[0] | word[1]) == 0) {
continue;
}
#if 0
for (int j=0; j < 8; j++) {
if ((tmp[j] & 0x0f) && (memcmp(s + i + j + 1, needle + 1, k - 2) == 0)) {
return i + j;
}
}
for (int j=0; j < 8; j++) {
if ((tmp[j] & 0xf0) && (memcmp(s + i + j + 1 + 8, needle + 1, k - 2) == 0)) {
return i + j + 8;
}
}
#else
// the above loops unrolled
// NOTE(review): walking the bytes of word[] with v >>= 8 matches the
// tmp[] order only on a little-endian target -- confirm if that matters
uint32_t v;
// RETURN_IF_EQ: test the nibble selected by MASK in the lowest byte of v
// and, if set, verify the inner bytes of the candidate at offset SHIFT
#define RETURN_IF_EQ(MASK, SHIFT) \
if ((v & MASK) && memcmp(s + i + SHIFT + 1, needle + 1, k - 2) == 0) { \
return i + SHIFT; \
}
// COMPARE: check the four predicate bytes of word[WORD_IDX], which stand
// for the positions SHIFT .. SHIFT + 3
#define COMPARE(MASK, WORD_IDX, SHIFT) \
v = word[WORD_IDX]; \
RETURN_IF_EQ(MASK, SHIFT + 0); \
v >>= 8; \
RETURN_IF_EQ(MASK, SHIFT + 1); \
v >>= 8; \
RETURN_IF_EQ(MASK, SHIFT + 2); \
v >>= 8; \
RETURN_IF_EQ(MASK, SHIFT + 3);
// positions are visited in ascending order: 0..3, 4..7, 8..11, 12..15
COMPARE(0x0f, 0, 0);
COMPARE(0x0f, 1, 4);
COMPARE(0xf0, 0, 8);
COMPARE(0xf0, 1, 12);
#undef RETURN_IF_EQ
#undef COMPARE
#endif
}
return std::string::npos;
}
// ------------------------------------------------------------------------
// Fixed-size variant of the NEON first/last byte scan: the needle length k
// is a template parameter and candidates are verified by `memcmp_fun`,
// called as memcmp_fun(text + pos + 1, needle + 1); it must return true
// when the bytes it is responsible for are equal.
//
// Returns the offset of the first verified position or std::string::npos.
// The loads may reach past s + n, so a returned offset can be greater
// than n - k; the caller is expected to reject such results.
template <size_t k, typename MEMCMP>
size_t FORCE_INLINE neon_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) {
assert(k > 0);
assert(n > 0);
const uint8x16_t first = vdupq_n_u8(needle[0]);
const uint8x16_t last = vdupq_n_u8(needle[k - 1]);
// selector for vbsl: low nibble taken from the first operand, high nibble from the second
const uint8x8_t half = vdup_n_u8(0x0f);
const uint8_t* ptr = reinterpret_cast<const uint8_t*>(s);
// the 8 predicate bytes are stored through `tmp` and inspected as two 32-bit words
union {
uint8_t tmp[8];
uint32_t word[2];
};
for (size_t i = 0; i < n; i += 16) {
const uint8x16_t block_first = vld1q_u8(ptr + i);
const uint8x16_t block_last = vld1q_u8(ptr + i + k - 1);
const uint8x16_t eq_first = vceqq_u8(first, block_first);
const uint8x16_t eq_last = vceqq_u8(last, block_last);
const uint8x16_t pred_16 = vandq_u8(eq_first, eq_last);
// pack 16 predicate bytes into 8: byte j keeps position j in its low
// nibble and position j + 8 in its high nibble
const uint8x8_t pred_8 = vbsl_u8(half, vget_low_u8(pred_16), vget_high_u8(pred_16));
vst1_u8(tmp, pred_8);
// no candidate in this block
if ((word[0] | word[1]) == 0) {
continue;
}
#if 0
for (int j=0; j < 8; j++) {
if ((tmp[j] & 0x0f) && memcmp_fun(s + i + j + 1, needle + 1)) {
return i + j;
}
}
for (int j=0; j < 8; j++) {
if ((tmp[j] & 0xf0) && memcmp_fun(s + i + j + 1 + 8, needle + 1)) {
return i + j + 8;
}
}
#else
// the above loops unrolled
// NOTE(review): walking the bytes of word[] with v >>= 8 matches the
// tmp[] order only on a little-endian target -- confirm if that matters
uint32_t v;
// RETURN_IF_EQ: test the nibble selected by MASK in the lowest byte of v
// and, if set, verify the candidate at offset SHIFT
#define RETURN_IF_EQ(MASK, SHIFT) \
if ((v & MASK) && memcmp_fun(s + i + SHIFT + 1, needle + 1)) { \
return i + SHIFT; \
}
// COMPARE: check the four predicate bytes of word[WORD_IDX], which stand
// for the positions SHIFT .. SHIFT + 3
#define COMPARE(MASK, WORD_IDX, SHIFT) \
v = word[WORD_IDX]; \
RETURN_IF_EQ(MASK, SHIFT + 0); \
v >>= 8; \
RETURN_IF_EQ(MASK, SHIFT + 1); \
v >>= 8; \
RETURN_IF_EQ(MASK, SHIFT + 2); \
v >>= 8; \
RETURN_IF_EQ(MASK, SHIFT + 3);
// positions are visited in ascending order: 0..3, 4..7, 8..11, 12..15
COMPARE(0x0f, 0, 0);
COMPARE(0x0f, 1, 4);
COMPARE(0xf0, 0, 8);
COMPARE(0xf0, 1, 12);
#undef RETURN_IF_EQ
#undef COMPARE
#endif
}
return std::string::npos;
}
// ------------------------------------------------------------------------
/*
    Finds the first occurrence of needle[0 .. k) in s[0 .. n).

    Returns the byte offset of the first match, 0 for an empty needle,
    or std::string::npos when the needle does not occur.

    Needles of 2..12 bytes are dispatched to kernels specialised on the
    length. The comparator receives pointers to the second byte of the
    candidate and of the needle; the first and last byte are matched by the
    kernel itself. For k = 5 and k = 9 the comparators memcmp4 and memcmp8
    are used: they cover one byte more than the k-2 inner bytes (the last
    needle byte, already known to match), which still lies inside the
    k-byte window.
*/
size_t neon_strstr_v2(const char* s, size_t n, const char* needle, size_t k) {

    size_t result = std::string::npos;

    if (n < k) {
        return result;
    }

    switch (k) {
        case 0:
            return 0;

        case 1: {
            // memchr respects the explicit length n; strchr would depend on a
            // NUL terminator and could report a position outside s[0 .. n)
            const char* res = static_cast<const char*>(memchr(s, needle[0], n));

            return (res != nullptr) ? static_cast<size_t>(res - s) : std::string::npos;
        }

        case 2:
            result = neon_strstr_memcmp<2>(s, n, needle, always_true);
            break;

        case 3:
            result = neon_strstr_memcmp<3>(s, n, needle, memcmp1);
            break;

        case 4:
            result = neon_strstr_memcmp<4>(s, n, needle, memcmp2);
            break;

        case 5:
            result = neon_strstr_memcmp<5>(s, n, needle, memcmp4);
            break;

        case 6:
            result = neon_strstr_memcmp<6>(s, n, needle, memcmp4);
            break;

        case 7:
            result = neon_strstr_memcmp<7>(s, n, needle, memcmp5);
            break;

        case 8:
            result = neon_strstr_memcmp<8>(s, n, needle, memcmp6);
            break;

        case 9:
            result = neon_strstr_memcmp<9>(s, n, needle, memcmp8);
            break;

        case 10:
            result = neon_strstr_memcmp<10>(s, n, needle, memcmp8);
            break;

        case 11:
            result = neon_strstr_memcmp<11>(s, n, needle, memcmp9);
            break;

        case 12:
            result = neon_strstr_memcmp<12>(s, n, needle, memcmp10);
            break;

        default:
            result = neon_strstr_anysize(s, n, needle, k);
            break;
    }

    // kernels may report candidates that start beyond the last valid
    // position (they load whole 16-byte blocks); reject those
    if (result <= n - k) {
        return result;
    } else {
        return std::string::npos;
    }
}
// ------------------------------------------------------------------------
// Convenience overload: forwards the raw buffers and sizes of both strings
// to the pointer/length variant of neon_strstr_v2.
size_t neon_strstr_v2(const std::string& s, const std::string& needle) {

    const char* const text    = s.data();
    const char* const pattern = needle.data();

    return neon_strstr_v2(text, s.size(), pattern, needle.size());
}
================================================
FILE: original/sse4_strstr-test.py
================================================
import sys, os, random
# Read the whole test file named by the first command line argument.
filename = "<unspecified>"
try:
    filename = sys.argv[1]
    # the context manager closes the file even if read() fails
    with open(filename, "r") as source:
        string = source.read()
except (IndexError, IOError):
    # missing argument or unreadable file; other errors are not masked
    print("can't open '%s'" % filename)
    sys.exit(1)

# Optional random seed, taken from the third command line argument;
# a missing or non-numeric value leaves the generator unseeded.
try:
    random.seed(int(sys.argv[3]))
except (IndexError, ValueError):
    pass
def time_command(command):
    """Run `command` through the shell and return its user CPU time in seconds.

    The time is measured by /usr/bin/time, which writes it (format "%U")
    to /tmp/measure; that file is then read back and parsed as a float.
    """
    os.system('/usr/bin/time -o /tmp/measure -f "%U" ' + command)

    # the context manager closes the file even when float() raises
    with open("/tmp/measure", "r") as f:
        return float(f.read())
def time(command1, command2, iters=10):
    """Time two commands with the same iteration count.

    The placeholder "__iters__" in both commands is replaced by the
    iteration count. The count is multiplied by 10 until the first command
    runs for more than one second; only then is the second command timed.

    Returns the tuple (iters, time of command1, time of command2).
    """
    while True:
        first = time_command(command1.replace("__iters__", str(iters)))
        if not first > 1:
            # too short to be measured reliably
            iters *= 10
            continue

        second = time_command(command2.replace("__iters__", str(iters)))
        return iters, first, second
def compare(filename, wordpos, word, wordlen):
    """Benchmark the libc and sse4 modes of ./a.out on one word.

    Every "%" in `word` is doubled before it is placed on the command line.
    Returns a one-line report with the word position and length, both
    timings and the speedup (libc time divided by sse4 time).
    """
    escaped = word.replace("%", "%%")

    libc_cmd = './a.out "%s" libc __iters__ "%s" > /dev/null' % (filename, escaped)
    sse4_cmd = './a.out "%s" sse4 __iters__ "%s" > /dev/null' % (filename, escaped)

    timings = time(libc_cmd, sse4_cmd)
    libc_time = timings[1]
    sse4_time = timings[2]

    return "[%d,%d] libc=%0.3fs sse4=%0.3fs speedup=%0.2f" % (
        wordpos, wordlen, libc_time, sse4_time, libc_time/sse4_time)
# Pick the first unused log name: sse4.log, sse41.log, sse42.log, ...
logname = "sse4.log"
lognumber = 1
while os.path.exists(logname):
    logname = "sse4%d.log" % lognumber
    lognumber += 1
log = open(logname, "w")

# For every word length 4..63 take three words from the text (one from the
# very beginning, one from the first kilobyte, one from the rest), verify
# that ./a.out finds each of them, and log the libc vs. sse4 timings.
try:
    for n in range(4, 64):
        i1 = random.randint(   0,   64)
        i2 = random.randint(  65, 1024)
        i3 = random.randint(1024, len(string)-n)
        print("length %d" % n)
        for i in [i1, i2, i3]:
            word = string[i:i+n]
            # escape characters that are special inside a double-quoted
            # shell argument
            for c in "\\`()<>{}\"":
                word = word.replace(c, "\\" + c)

            cmd = './a.out "%s" verify 1 "%s"' % (filename, word)
            err = os.system(cmd)
            if err:
                # show the unescaped word that failed verification
                print(repr(string[i:i+n]))
                sys.exit(1)
            else:
                s = compare(filename, i, word, n)
                log.write(s + "\n")
                print(s)
except Exception:
    # report unexpected errors, but let sys.exit() and Ctrl-C through so
    # that a failed verification really ends with a non-zero exit status
    import traceback
    traceback.print_exc()
finally:
    log.close()
================================================
FILE: original/sse4_strstr.c
================================================
/*
SSE4 string search --- modification of Karp-Rabin algorithm, $Revision: 1.11 $
Acceleration of strstr using the SSE4 instruction MPSADBW.
This program includes one wrapper, sse4_strstr, around
the following functions:
* sse4_strstr_any - exact comparison is done with the library
function strncmp
* sse4_strstr_len3, sse4_strstr_len4 - optimized
for substrings of length 3 and 4 chars, no additional comparison
is needed
* sse4_strstr_max20, sse4_strstr_max36 - optimized
for substrings of length 4..20 and 20..36, exact comparison
is done with a few assembler instructions
Author: Wojciech Muła
e-mail: wojciech_mula@poczta.onet.pl
www: http://0x80.pl/
License: BSD
initial release 27-05-2008, last update $Date: 2008-06-08 23:00:44 $
*/
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>
/* Byte masks for partial 16-byte comparisons: row i has its first i+1 bytes
   set to 0xff and the remaining bytes cleared (row 0 selects 1 byte, row 15
   selects all 16). */
static uint8_t mask[][16] = {
{0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},
{0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},
{0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},
{0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},
{0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},
{0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},
{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},
{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},
{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00},
{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00},
{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00},
{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00},
{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00},
{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00},
{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00},
{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff},
};
/* Length-specialised search routines selected by sse4_strstr.
   In all of them s1/n1 is the searched-for substring and its length,
   s2/n2 the text and its length. */
char* sse4_strstr_any(char* s1, int n1, char* s2, int n2);
char* sse4_strstr_len3(char* s1, int n1, char* s2, int n2);
char* sse4_strstr_len4(char* s1, int n1, char* s2, int n2);
char* sse4_strstr_max20(char* s1, int n1, char* s2, int n2);
char* sse4_strstr_max36(char* s1, int n1, char* s2, int n2);
/*
	Finds substring s1 (length n1) in text s2 (length n2), selecting the
	routine by the substring length:

	* n1 == 0  - nothing to search for, returns NULL
	* n1 == 1  - strchr
	* n1 == 2  - strstr
	* n1 == 3, n1 == 4 - dedicated routines
	* n1 in 5..20, 21..36 - routines with inlined exact comparison
	* longer - generic routine

	Returns the pointer to the first occurrence or NULL.
	For n1 <= 2 both strings must be NUL-terminated (n2 is not used).
*/
char* sse4_strstr(char* s1, int n1, char* s2, int n2) {
	switch (n1) {
		case 0:
			return NULL;

		case 1:
			/* the only character of the substring is s1[0];
			   s1[1] is already the byte after it */
			return strchr(s2, s1[0]);

		case 2:
			return strstr(s2, s1);

		case 3:
			return sse4_strstr_len3(s1, n1, s2, n2);

		case 4:
			return sse4_strstr_len4(s1, n1, s2, n2);

		case 5: case 6: case 7: case 8: case 9:
		case 10: case 11: case 12: case 13: case 14:
		case 15: case 16: case 17: case 18: case 19:
		case 20: /* 5..20 */
			return sse4_strstr_max20(s1, n1, s2, n2);

		case 21: case 22: case 23: case 24: case 25:
		case 26: case 27: case 28: case 29: case 30:
		case 31: case 32: case 33: case 34: case 35:
		case 36: /* 21..36 */
			return sse4_strstr_max36(s1, n1, s2, n2);

		default:
			return sse4_strstr_any(s1, n1, s2, n2);
	}
}
/*
Generic search of substring s1 (length n1) in text s2 (length n2).

The 4-byte prefix of s1 is matched against eight consecutive text positions
at once with MPSADBW (a sum of absolute differences equal to 0 means the
four bytes are equal); every hit is then verified by calling
strncmp(s1+4, s2+pos+4, n1-4). The text pointer advances by 8 bytes per
iteration and the loop counter starts at n2-n1.

Returns the address of the first occurrence, or NULL.

NOTE(review): the assembly is 32-bit x86 and fetches s1 and n1 directly
from the stack frame (8(%ebp), 12(%ebp)), so it presumably requires a
cdecl frame with a frame pointer -- confirm before changing build flags.
*/
char* sse4_strstr_any(char* s1, int n1, char* s2, int n2) {
// n1 > 4, n2 > 4
char* result;
uint32_t dummy __attribute__((unused));
// xmm1 - first 16 bytes of s1 (only the 4-byte prefix is used by MPSADBW)
__asm__ volatile ("movdqu (%%eax), %%xmm1" : : "a" (s1));
// xmm0 - packed zeros, reference value for the "L1 distance == 0" test
__asm__ volatile ("pxor %%xmm0, %%xmm0" : : );
__asm__ volatile (
/*** initialization ****************************************************/
// we have to save 3 registers: eax, ecx and edx
// also strncmp needs three arguments, thus esp -= (3+3)*4 = 24
" addl $-24, %%esp \n"
// function strncmp is invoked with arguments s1+4, s2+4, n1-4 -- s1+4 and
// n1-4 are constant across all iterations, thus the stack frame
// can be partially initialized:
" movl 8(%%ebp), %%eax \n"
" addl $4, %%eax \n"
" movl %%eax, 0(%%esp) \n" // s1+4
" \n"
" movl 12(%%ebp), %%eax \n"
" subl $4, %%eax \n"
" movl %%eax, 8(%%esp) \n" // n1-4
" \n"
/*** main loop *********************************************************/
"0: \n"
// load 16 bytes, we consider just 8+3 chars at the beginning
" movdqu (%%esi), %%xmm2 \n"
" addl $8, %%esi \n" // advance text pointer: s2 += 8
// xmm2 - vector of L1 distances between s1's 4-byte prefix
// and sequence of eight 4-byte subvectors from xmm2
" mpsadbw $0, %%xmm1, %%xmm2 \n"
// xmm2 - word becomes 0xffff if L1=0, 0x0000 otherwise
" pcmpeqw %%xmm0, %%xmm2 \n"
// any L1=0? if not, skip the comparison inner loop
" ptest %%xmm2, %%xmm0 \n"
" jc 1f \n"
/*** inner loop ************************************************/
// comparison inner loop: convert word mask to bitmask
" pmovmskb %%xmm2, %%edx \n"
// we are interested in **word** indexes
" andl $0b0101010101010101, %%edx \n"
" 2: \n"
" bsf %%edx, %%eax \n" // get next bit position
" jz 1f \n" // no bit set? exit loop
" \n"
" btr %%eax, %%edx \n" // unset bit
" shr $1, %%eax \n" // divide position by 2
// save registers before invoking strncmp
" movl %%eax, 12(%%esp) \n"
" movl %%ecx, 16(%%esp) \n"
" movl %%edx, 20(%%esp) \n"
// update function argument
" leal -4(%%esi, %%eax), %%eax \n"
" movl %%eax, 4(%%esp) \n" // s2+4
// invoke strncmp(s1+4, s2+4, n1-4)
" call strncmp \n"
" test %%eax, %%eax \n" // result == 0?
// restore registers
" movl 12(%%esp), %%eax \n"
" movl 16(%%esp), %%ecx \n"
" movl 20(%%esp), %%edx \n"
" jnz 2b \n"
" leal -8(%%eax, %%esi), %%eax \n" // eax -- address
" jmp 4f \n" // of s1's first occurrence
/*** main loop epilogue ************************************************/
"1: \n"
" subl $8, %%ecx \n"
" cmpl $0, %%ecx \n"
" jg 0b \n"
" xorl %%eax, %%eax \n" // s1 not found, return NULL
"4: \n"
" addl $24, %%esp \n" // and finally restore stack frame
: "=a" (result),
"=S" (dummy),
"=c" (dummy)
: "S" (s2),
"c" (n2-n1)
);
return result;
}
/*
 * Finds the first occurrence of s1 (the needle, n1 bytes) inside s2 (the
 * haystack, n2 bytes) for needles whose tail after the 4-byte prefix fits in
 * one 16-byte vector; written as 32-bit x86 inline assembly.
 *
 * MPSADBW finds positions where the first four bytes of s1 match; the 16 bytes
 * following the prefix are then compared with PCMPEQB and filtered through the
 * mask loaded from mask[n1-5].
 *
 * Returns the address of the match inside s2, or NULL when the loop ends
 * without one.
 *
 * NOTE(review): `mask` is defined outside this block; presumably mask[n1-5]
 * selects the n1-4 tail bytes that must match -- verify against its definition.
 * NOTE(review): edx, the flags and xmm registers are modified without a
 * clobber list, and xmm0/xmm1/xmm2/xmm6 are set in separate asm statements --
 * confirm the compiler in use keeps them intact.
 * NOTE(review): every load is 16 bytes wide regardless of the real lengths of
 * s1 and s2 -- confirm the buffers tolerate the over-read.
 */
char* sse4_strstr_max20(char* s1, int n1, char* s2, int n2) {
// original author's precondition: 4 <= n1 <= 20, n2 > 4
// NOTE(review): mask[n1-5] below implies n1 >= 5 -- confirm with the caller
uint32_t dummy __attribute__((unused));
char* result;
// xmm6 = 16 bytes loaded from the address mask[n1-5]
__asm__ volatile ("movdqu (%%eax), %%xmm6" : : "a" (mask[n1-5]));
// xmm1 = 16 bytes loaded from s1 (MPSADBW uses only its first 4 bytes)
__asm__ volatile ("movdqu (%%eax), %%xmm1" : : "a" (s1));
__asm__ volatile ("movdqu (%%eax), %%xmm2" : : "a" (s1+4)); // xmm2 = 16 bytes starting at s1+4 (tail of s1)
// xmm0 = packed zeros
__asm__ volatile ("pxor %%xmm0, %%xmm0" : : );
__asm__ volatile (
/*** main loop *********************************************************/
"0: \n"
// load 16 bytes of s2; MPSADBW considers just the first 8+3 of them
" movdqu (%%esi), %%xmm7 \n"
" addl $8, %%esi \n" // advance the s2 pointer by 8 (esi was loaded with s2)
// xmm7 - vector of L1 distances between the 4-byte prefix of s1
// and the eight 4-byte windows starting at offsets 0..7 of the loaded bytes
" mpsadbw $0, %%xmm1, %%xmm7 \n"
// xmm7 - a word becomes 0xffff if its L1 distance is 0, 0x0000 otherwise
" pcmpeqw %%xmm0, %%xmm7 \n"
// any distance equal to 0? CF is set when xmm7 is all zeros (xmm0 is zero);
// in that case skip the comparison inner loop
" ptest %%xmm7, %%xmm0 \n"
" jc 1f \n"
/*** inner loop ************************************************/
// comparison inner loop: convert the word mask to a bitmask
" pmovmskb %%xmm7, %%edx \n"
// we are interested in **word** positions: keep one bit per word
" andl $0b0101010101010101, %%edx \n"
" 2: \n"
" bsf %%edx, %%eax \n" // get the position of the lowest set bit
" jz 1f \n" // no bit set? exit the inner loop
" \n"
" btr %%eax, %%edx \n" // clear that bit
" shr $1, %%eax \n" // bit position / 2 = word index = window offset
// load the 16 bytes that follow the candidate's 4-byte prefix
// (esi was already advanced by 8, hence the -4 displacement)
" movdqu -4(%%esi, %%eax), %%xmm7 \n"
// xmm7 - a byte becomes 0xff where it equals the tail of s1
" pcmpeqb %%xmm2, %%xmm7 \n"
// CF is set only when every bit set in the mask xmm6 is also set in xmm7
" ptest %%xmm6, %%xmm7 \n"
" jnc 2b \n" // some masked byte differs: try the next candidate
" leal -8(%%eax, %%esi), %%eax \n" // eax = address of the first
" jmp 4f \n" // occurrence of s1
/*** main loop control *************************************************/
"1: \n"
" subl $8, %%ecx \n" // ecx starts as n2-n1; 8 start positions handled per pass
" cmpl $0, %%ecx \n"
" jg 0b \n"
" xorl %%eax, %%eax \n" // s1 not found, return NULL
"4: \n"
: "=a" (result),
"=S" (dummy),
"=c" (dummy)
: "S" (s2),
"c" (n2-n1)
);
return result;
}
/*
 * Finds the first occurrence of s1 (the needle, n1 bytes) inside s2 (the
 * haystack, n2 bytes) for needles whose tail after the 4-byte prefix spans two
 * 16-byte vectors; written as 32-bit x86 inline assembly.
 *
 * MPSADBW finds positions where the first four bytes of s1 match; then the 16
 * bytes at s1+4 are compared in full and the 16 bytes at s1+20 are compared
 * after both sides are ANDed with the mask loaded from mask[n1-5-16].
 *
 * Returns the address of the match inside s2, or NULL when the loop ends
 * without one.
 *
 * NOTE(review): `mask` is defined outside this block; presumably
 * mask[n1-5-16] selects the n1-20 bytes of the last vector that belong to the
 * needle -- verify against its definition.
 * NOTE(review): edx, the flags and xmm registers are modified without a
 * clobber list, and the xmm constants are prepared in separate asm
 * statements -- confirm the compiler in use keeps them intact.
 * NOTE(review): every load is 16 bytes wide regardless of the real lengths of
 * s1 and s2 -- confirm the buffers tolerate the over-read.
 */
char* sse4_strstr_max36(char* s1, int n1, char* s2, int n2) {
// original author's precondition: 20 <= n1 <= 36, n2 > 4
// NOTE(review): mask[n1-5-16] below implies n1 >= 21 -- confirm with the caller
uint32_t dummy __attribute__((unused));
char* result;
// xmm1 = 16 bytes loaded from s1 (MPSADBW uses only its first 4 bytes)
__asm__ volatile ("movdqu (%%eax), %%xmm1" : : "a" (s1));
__asm__ volatile ("movdqu (%%eax), %%xmm2" : : "a" (s1+4)); // xmm2 = s1[4:20]
__asm__ volatile ("movdqu (%%eax), %%xmm3" : : "a" (s1+4+16)); // xmm3 = 16 bytes from s1+20 (end of the needle)
// xmm6 = 16 bytes loaded from the address mask[n1-5-16]
__asm__ volatile ("movdqu (%%eax), %%xmm6" : : "a" (mask[n1-5-16]));
// keep only the masked bytes of xmm3
__asm__ volatile ("pand %%xmm6, %%xmm3" : : );
__asm__ volatile ("pxor %%xmm0, %%xmm0" : : ); // xmm0 = packed_byte(0x00)
__asm__ volatile ("pcmpeqb %%xmm5, %%xmm5" : : ); // xmm5 = packed_byte(0xff)
__asm__ volatile (
/*** main loop *********************************************************/
"0: \n"
// load 16 bytes of s2; MPSADBW considers just the first 8+3 of them
" movdqu (%%esi), %%xmm7 \n"
" addl $8, %%esi \n" // advance the s2 pointer by 8 (esi was loaded with s2)
// xmm7 - vector of L1 distances between the 4-byte prefix of s1
// and the eight 4-byte windows starting at offsets 0..7 of the loaded bytes
" mpsadbw $0, %%xmm1, %%xmm7 \n"
// xmm7 - a word becomes 0xffff if its L1 distance is 0, 0x0000 otherwise
" pcmpeqw %%xmm0, %%xmm7 \n"
// any distance equal to 0? CF is set when xmm7 is all zeros (xmm0 is zero);
// in that case skip the comparison inner loop
" ptest %%xmm7, %%xmm0 \n"
" jc 1f \n"
/*** inner loop ************************************************/
// comparison inner loop: convert the word mask to a bitmask
" pmovmskb %%xmm7, %%edx \n"
// we are interested in **word** positions: keep one bit per word
" andl $0b0101010101010101, %%edx \n"
" 2: \n"
" bsf %%edx, %%eax \n" // get the position of the lowest set bit
" jz 1f \n" // no bit set? exit the inner loop
" \n"
" btr %%eax, %%edx \n" // clear that bit
" shr $1, %%eax \n" // bit position / 2 = word index = window offset
// esi was already advanced by 8, so -4 and 12 address candidate+4 and candidate+20
" movdqu -4(%%esi, %%eax), %%xmm7 \n"
" movdqu 12(%%esi, %%eax), %%xmm4 \n"
" pand %%xmm6, %%xmm4 \n" // mask the second vector like xmm3
" pcmpeqb %%xmm2, %%xmm7 \n" // 0xff where candidate[4:20] equals s1[4:20]
" pcmpeqb %%xmm3, %%xmm4 \n" // 0xff where the masked second vectors are equal
" pand %%xmm7, %%xmm4 \n" // combine both comparison results
// CF is set only when xmm4 is all ones (xmm5 is packed 0xff)
" ptest %%xmm5, %%xmm4 \n"
" jnc 2b \n" // some byte differs: try the next candidate
" leal -8(%%eax, %%esi), %%eax \n" // eax = address of the first
" jmp 4f \n" // occurrence of s1
/*** main loop control *************************************************/
"1: \n"
" subl $8, %%ecx \n" // ecx starts as n2-n1; 8 start positions handled per pass
" cmpl $0, %%ecx \n"
" jg 0b \n"
" xorl %%eax, %%eax \n" // s1 not found, return NULL
"4: \n"
: "=a" (result),
"=S" (dummy),
"=c" (dummy)
: "S" (s2),
"c" (n2-n1)
);
return result;
}
/*
 * Finds the first occurrence of a 4-byte needle s1 inside s2 (the haystack,
 * n2 bytes); written as 32-bit x86 inline assembly.
 *
 * A zero L1 distance reported by MPSADBW for a 4-byte window is already a
 * full match, so no second comparison stage is needed.
 *
 * Returns the address of the match inside s2, or NULL when the loop ends
 * without one.
 *
 * NOTE(review): the flags and xmm registers are modified without a clobber
 * list, and xmm0/xmm1 are set in separate asm statements -- confirm the
 * compiler in use keeps them intact.
 * NOTE(review): 16 bytes are loaded from s1 and from s2 regardless of their
 * real lengths -- confirm the buffers tolerate the over-read.
 */
char* sse4_strstr_len4(char* s1, int n1, char* s2, int n2) {
// precondition stated by the original author: n1 == 4, n2 > 4
uint32_t dummy __attribute__((unused));
char* result;
// xmm1 = 16 bytes loaded from s1 (MPSADBW uses only its first 4 bytes)
__asm__ volatile ("movdqu (%%eax), %%xmm1" : : "a" (s1));
// xmm0 = packed zeros
__asm__ volatile ("pxor %%xmm0, %%xmm0" : : );
__asm__ volatile (
/*** main loop *********************************************************/
"0: \n"
// load 16 bytes of s2; MPSADBW considers just the first 8+3 of them
" movdqu (%%esi), %%xmm2 \n"
" addl $8, %%esi \n" // advance the s2 pointer by 8 (esi was loaded with s2)
// xmm2 - vector of L1 distances between the 4 bytes of s1
// and the eight 4-byte windows starting at offsets 0..7 of the loaded bytes
" mpsadbw $0, %%xmm1, %%xmm2 \n"
// xmm2 - a word becomes 0xffff if its L1 distance is 0, 0x0000 otherwise
" pcmpeqw %%xmm0, %%xmm2 \n"
// any distance equal to 0? CF is clear when xmm2 has any bit set
// (xmm0 is zero); in that case a match was found
" ptest %%xmm2, %%xmm0 \n"
" jnc 1f \n"
" subl $8, %%ecx \n" // ecx starts as n2-n1; 8 start positions handled per pass
" cmpl $0, %%ecx \n"
" jg 0b \n"
" xorl %%eax, %%eax \n" // s1 not found, return NULL
" jmp 2f \n"
"1: \n"
// match found: turn the word mask into the offset of the first matching window
" pmovmskb %%xmm2, %%eax \n"
" bsfl %%eax, %%eax \n" // position of the lowest set bit
" shrl $1, %%eax \n" // two mask bits per word: divide by 2
" lea -8(%%esi, %%eax), %%eax \n" // esi was already advanced by 8
"2: \n"
: "=a" (result),
"=S" (dummy),
"=c" (dummy)
: "S" (s2),
"c" (n2-n1)
);
return result;
}
/*
 * Variant of the 4-byte search intended, judging by its name, for 3-byte
 * needles; written as 32-bit x86 inline assembly.
 *
 * MPSADBW always sums absolute differences over 4 bytes. Here the 4th byte of
 * every haystack window (zero-extended to a word) is subtracted from the
 * corresponding sum before the comparison with zero.
 * NOTE(review): this leaves exactly the 3-byte distance only when the 4th
 * byte loaded from s1 is 0 -- presumably the terminating NUL of the needle;
 * confirm with the callers.
 *
 * Returns the address of the match inside s2, or NULL when the loop ends
 * without one.
 *
 * NOTE(review): the flags and xmm registers are modified without a clobber
 * list, and xmm0/xmm1 are set in separate asm statements -- confirm the
 * compiler in use keeps them intact.
 * NOTE(review): 16 bytes are loaded from s1 and from s2 regardless of their
 * real lengths -- confirm the buffers tolerate the over-read.
 */
char* sse4_strstr_len3(char* s1, int n1, char* s2, int n2) {
// presumably n1 == 3, n2 > 4 (the original comment said n1 == 4) -- TODO confirm
uint32_t dummy __attribute__((unused));
char* result;
// xmm1 = 16 bytes loaded from s1 (MPSADBW uses only its first 4 bytes)
__asm__ volatile ("movdqu (%%eax), %%xmm1" : : "a" (s1));
// xmm0 = packed zeros
__asm__ volatile ("pxor %%xmm0, %%xmm0" : : );
__asm__ volatile (
/*** main loop *********************************************************/
"0: \n"
// load 16 bytes of s2; MPSADBW considers just the first 8+3 of them
" movdqu (%%esi), %%xmm2 \n"
" addl $8, %%esi \n" // advance the s2 pointer by 8 (esi was loaded with s2)
// xmm3 - loaded bytes 3..10 zero-extended to words, i.e. the 4th byte
// of each of the eight 4-byte windows
" movdqa %%xmm2, %%xmm3 \n"
" psrldq $3, %%xmm3 \n"
" pmovzxbw %%xmm3, %%xmm3 \n"
// xmm2 - vector of L1 distances between the first 4 bytes of xmm1
// and the eight 4-byte windows starting at offsets 0..7 of the loaded bytes
" mpsadbw $0, %%xmm1, %%xmm2 \n"
// remove the 4th byte of every window from its sum
" psubw %%xmm3, %%xmm2 \n"
// xmm2 - a word becomes 0xffff if the corrected distance is 0, 0x0000 otherwise
" pcmpeqw %%xmm0, %%xmm2 \n"
// any distance equal to 0? CF is clear when xmm2 has any bit set
// (xmm0 is zero); in that case a match was found
" ptest %%xmm2, %%xmm0 \n"
" jnc 1f \n"
" subl $8, %%ecx \n" // ecx starts as n2-n1; 8 start positions handled per pass
" cmpl $0, %%ecx \n"
" jg 0b \n"
" xorl %%eax, %%eax \n" // s1 not found, return NULL
" jmp 2f \n"
"1: \n"
// match found: turn the word mask into the offset of the first matching window
" pmovmskb %%xmm2, %%eax \n"
" bsfl %%eax, %%eax \n" // position of the lowest set bit
" shrl $1, %%eax \n" // two mask bits per word: divide by 2
" lea -8(%%esi, %%eax), %%eax \n" // esi was already advanced by 8
"2: \n"
: "=a" (result),
"=S" (dummy),
"=c" (dummy)
: "S" (s2),
"c" (n2-n1)
);
return result;
}
// sample
// Global storage for the text that main() reads from the input file: room for
// 1024*500 bytes plus one extra byte (presumably reserved for the terminating
// NUL that main() stores right after the data).
uint8_t buffer[1024*500 + 1];
/* Prints the usage text to stdout and terminates the program with status 1. */
void help() {
    static const char* const usage[] = {
        "prog file sse4|libc|verify iter-count string",
        "* iter-count > 0",
    };
    unsigned k;

    for (k = 0; k < sizeof(usage) / sizeof(usage[0]); k++)
        puts(usage[k]);

    exit(1);
}
/*
 * Command-line driver: prog file sse4|libc|verify iter-count string
 *
 *   argv[1]  path of the file whose contents become the haystack
 *   argv[2]  mode: "sse4" (benchmark sse4_strstr), "libc" (benchmark strstr)
 *            or "verify" (compare both results once); case-insensitive
 *   argv[3]  iteration count, must be > 0 unless the mode is "verify"
 *   argv[4]  needle, at least 3 characters long
 *
 * Exit status: 0 on success, 1 when verification fails (help() also exits
 * with 1 on bad usage), 2 when the file cannot be opened.
 */
int main(int argc, char* argv[]) {
    FILE* f;
    int i;
    int size;

    if (argc != 5)
        help();

    f = fopen(argv[1], "r");
    if (!f) {
        printf("can't open '%s'\n", argv[1]);
        return 2;
    }

    // Read at most sizeof(buffer)-1 bytes: the last byte of the buffer is kept
    // free so that the terminating NUL stored below never lands past the end
    // of the array, even when the file is larger than the buffer.
    size = fread(buffer, 1, sizeof(buffer) - 1, f);
    buffer[size] = 0;
    fclose(f);

    int fun = -1, iters, n1;
    char* s1;

    // select the mode; help() does not return
    if (strcasecmp("sse4", argv[2]) == 0)
        fun = 0;
    else if (strcasecmp("libc", argv[2]) == 0)
        fun = 1;
    else if (strcasecmp("verify", argv[2]) == 0)
        fun = 2;
    else
        help();

    // the iteration count is only validated for the benchmark modes
    if (atoi(argv[3]) <= 0 && (fun != 2))
        help();
    else
        iters = atoi(argv[3]);

    s1 = argv[4];
    n1 = strlen(s1);
    if ((n1 < 3))
        help();
    else
        printf("s1(%d)='%s' s2(%d)\n", n1, s1, size);

    char* r1;
    char* r2;

    switch (fun) {
        case 0:
            puts("SSE4");
            for (i=0; i < iters; i++)
                sse4_strstr(s1, n1, (char*)buffer, size);
            break;

        case 1:
            puts("Lib C");
            for (i=0; i < iters; i++) {
                // Equivalent of strstr((char*)buffer, s1) with the result
                // discarded; spelled in assembly by the original author.
                // NOTE(review): the arguments are stored at (%esp) and
                // 4(%esp), which assumes the compiler keeps that area free
                // for outgoing arguments -- confirm for the build flags used.
                __asm__ volatile (
                    "movl $buffer, (%%esp)\n"
                    "movl %0, 4(%%esp)\n"
                    "call strstr\n"
                    :
                    : "r" (s1)
                    : "eax", "ecx", "edx"
                );
            }
            break;

        case 2:
            puts("verify");
            r1 = strstr((char*)buffer, s1);
            r2 = sse4_strstr(s1, n1, (char*)buffer, size);
            printf("LibC = %u\n", (unsigned int)r1);
            printf("SSE4 = %u %s\n",
                (unsigned int)r2,
                (r1 != r2) ? "FAILED!!!" : "ok"
            );
            if (r1 != r2)
                return 1;
    }

    return 0;
}
// eof
================================================
FILE: results/armv7-32bit-gcc4.9.2.txt
================================================
./speedup_arm data/i386.txt data/words 1
std::strstr ... reference result = 810807651, time = 7.318775 s
std::string::find ... reference result = 810807651, time = 4.171311 s
SWAR 32-bit (generic) ... reference result = 810807651, time = 2.450585 s
ARM Neon 32 bit (v2) ... reference result = 810807651, time = 1.299383 s
./speedup_arm data/i386.txt data/words 1
std::strstr ... reference result = 810807651, time = 7.329223 s
std::string::find ... reference result = 810807651, time = 4.188313 s
SWAR 32-bit (generic) ... reference result = 810807651, time = 2.461333 s
ARM Neon 32 bit (v2) ... reference result = 810807651, time = 1.305622 s
./speedup_arm data/i386.txt data/words 1
std::strstr ... reference result = 810807651, time = 7.304049 s
std::string::find ... reference result = 810807651, time = 4.172608 s
SWAR 32-bit (generic) ... reference result = 810807651, time = 2.451913 s
ARM Neon 32 bit (v2) ... reference result = 810807651, time = 1.300619 s
./speedup_arm data/i386.txt data/words 1
std::strstr ... reference result = 810807651, time = 7.307621 s
std::string::find ... reference result = 810807651, time = 4.176439 s
SWAR 32-bit (generic) ... reference result = 810807651, time = 2.451030 s
ARM Neon 32 bit (v2) ... reference result = 810807651, time = 1.299274 s
./speedup_arm data/i386.txt data/words 1
std::strstr ... reference result = 810807651, time = 7.313498 s
std::string::find ... reference result = 810807651, time = 4.175714 s
SWAR 32-bit (generic) ... reference result = 810807651, time = 2.451439 s
ARM Neon 32 bit (v2) ... reference result = 810807651, time = 1.298613 s
================================================
FILE: results/armv8-64bit-clang3.8.0.txt
================================================
std::strstr ... reference result = 810807651, time = 3.457578 s
std::string::find ... reference result = 810807651, time = 1.821379 s
SWAR 64-bit (generic) ... reference result = 810807651, time = 0.463006 s
SWAR 32-bit (generic) ... reference result = 810807651, time = 0.810749 s
ARM Neon 32 bit (v2) ... reference result = 810807651, time = 0.407214 s
AArch64 64 bit (v2) ... reference result = 810807651, time = 0.279203 s
std::strstr ... reference result = 810807651, time = 3.381364 s
std::string::find ... reference result = 810807651, time = 1.813678 s
SWAR 64-bit (generic) ... reference result = 810807651, time = 0.462694 s
SWAR 32-bit (generic) ... reference result = 810807651, time = 0.810882 s
ARM Neon 32 bit (v2) ... reference result = 810807651, time = 0.406888 s
AArch64 64 bit (v2) ... reference result = 810807651, time = 0.278970 s
std::strstr ... reference result = 810807651, time = 4.118293 s
std::string::find ... reference result = 810807651, time = 1.822696 s
SWAR 64-bit (generic) ... reference result = 810807651, time = 0.463028 s
SWAR 32-bit (generic) ... reference result = 810807651, time = 0.810933 s
ARM Neon 32 bit (v2) ... reference result = 810807651, time = 0.407296 s
AArch64 64 bit (v2) ... reference result = 810807651, time = 0.279606 s
std::strstr ... reference result = 810807651, time = 3.375462 s
std::string::find ... reference result = 810807651, time = 1.821449 s
SWAR 64-bit (generic) ... reference result = 810807651, time = 0.462863 s
SWAR 32-bit (generic) ... reference result = 810807651, time = 0.811320 s
ARM Neon 32 bit (v2) ... reference result = 810807651, time = 0.407274 s
AArch64 64 bit (v2) ... reference result = 810807651, time = 0.279285 s
std::strstr ... reference result = 810807651, time = 3.378566 s
std::string::find ... reference result = 810807651, time = 1.825054 s
SWAR 64-bit (generic) ... reference result = 810807651, time = 0.462957 s
SWAR 32-bit (generic) ... reference result = 810807651, time = 0.811188 s
ARM Neon 32 bit (v2) ... reference result = 810807651, time = 0.407364 s
AArch64 64 bit (v2) ... reference result = 810807651, time = 0.279490 s
================================================
FILE: results/bulldozer-fx-8510-gcc4.8.4-sse.txt
================================================
./speedup data/i386.txt data/words
std::strstr ... reference result = 8108076510, time = 9.390892 s
SWAR 64-bit (generic) ... reference result = 8108076510, time = 2.938355 s
SSE2 (generic) ... reference result = 8108076510, time = 0.788781 s
SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 1.989833 s
SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 2.060081 s
SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 2.006810 s
./speedup data/i386.txt data/words
std::strstr ... reference result = 8108076510, time = 9.387153 s
SWAR 64-bit (generic) ... reference result = 8108076510, time = 2.948608 s
SSE2 (generic) ... reference result = 8108076510, time = 0.789325 s
SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 1.988635 s
SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 2.066327 s
SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 2.007233 s
./speedup data/i386.txt data/words
std::strstr ... reference result = 8108076510, time = 9.377923 s
SWAR 64-bit (generic) ... reference result = 8108076510, time = 2.967027 s
SSE2 (generic) ... reference result = 8108076510, time = 0.788709 s
SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 1.989077 s
SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 2.065608 s
SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 2.007228 s
================================================
FILE: results/cascadelake-Gold-5217-gcc-7.4.0-avx512bw.txt
================================================
./speedup_avx512bw data/i386.txt data/words
scalar (naive) ... reference result = 8108076510, time = 4.095307 s
std::strstr ... reference result = 8108076510, time = 0.492459 s
SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.243510 s
SWAR 32-bit (generic) ... reference result = 8108076510, time = 2.349437 s
SSE2 (generic) ... reference result = 8108076510, time = 0.443313 s
SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.583372 s
SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.822263 s
SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.311350 s
SSE (naive) ... reference result = 8108076510, time = 1.757493 s
AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.531920 s
AVX2 (generic) ... reference result = 8108076510, time = 0.338738 s
AVX2 (naive) ... reference result = 8108076510, time = 1.013489 s
AVX2-wide (naive) ... reference result = 8107771150, time = 0.480182 s
AVX512F (MPSADBW-like) ... reference result = 8108076510, time = 0.634909 s
AVX512F (generic) ... reference result = 8108076510, time = 0.281276 s
AVX512BW (generic) ... reference result = 8108076510, time = 0.256798 s
./speedup_avx512bw data/i386.txt data/words
scalar (naive) ... reference result = 8108076510, time = 4.089051 s
std::strstr ... reference result = 8108076510, time = 0.492275 s
SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.243637 s
SWAR 32-bit (generic) ... reference result = 8108076510, time = 2.343078 s
SSE2 (generic) ... reference result = 8108076510, time = 0.443659 s
SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.584467 s
SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.822993 s
SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.313485 s
SSE (naive) ... reference result = 8108076510, time = 1.760697 s
AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.531827 s
AVX2 (generic) ... reference result = 8108076510, time = 0.338912 s
AVX2 (naive) ... reference result = 8108076510, time = 1.012637 s
AVX2-wide (naive) ... reference result = 8107771150, time = 0.478455 s
AVX512F (MPSADBW-like) ... reference result = 8108076510, time = 0.636537 s
AVX512F (generic) ... reference result = 8108076510, time = 0.279054 s
AVX512BW (generic) ... reference result = 8108076510, time = 0.255777 s
./speedup_avx512bw data/i386.txt data/words
scalar (naive) ... reference result = 8108076510, time = 4.092489 s
std::strstr ... reference result = 8108076510, time = 0.489993 s
SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.241418 s
SWAR 32-bit (generic) ... reference result = 8108076510, time = 2.346954 s
SSE2 (generic) ... reference result = 8108076510, time = 0.442109 s
SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.583955 s
SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.822657 s
SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.312243 s
SSE (naive) ... reference result = 8108076510, time = 1.757719 s
AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.532528 s
AVX2 (generic) ... reference result = 8108076510, time = 0.338666 s
AVX2 (naive) ... reference result = 8108076510, time = 1.013151 s
AVX2-wide (naive) ... reference result = 8107771150, time = 0.477202 s
AVX512F (MPSADBW-like) ... reference result = 8108076510, time = 0.634753 s
AVX512F (generic) ... reference result = 8108076510, time = 0.280525 s
AVX512BW (generic) ... reference result = 8108076510, time = 0.256838 s
================================================
FILE: results/haswell-i7-4770-gcc5.4.1-avx2.txt
================================================
./speedup_avx2 data/i386.txt data/words
std::strstr ... reference result = 8108076510, time = 0.528137 s
SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.605520 s
SSE2 (generic) ... reference result = 8108076510, time = 0.554532 s
SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.897859 s
SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.996473 s
SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.559956 s
AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.615836 s
AVX2 (generic) ... reference result = 8108076510, time = 0.386747 s
./speedup_avx2 data/i386.txt data/words
std::strstr ... reference result = 8108076510, time = 0.527864 s
SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.577149 s
SSE2 (generic) ... reference result = 8108076510, time = 0.554352 s
SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.897752 s
SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.996771 s
SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.560012 s
AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.615825 s
AVX2 (generic) ... reference result = 8108076510, time = 0.386528 s
./speedup_avx2 data/i386.txt data/words
std::strstr ... reference result = 8108076510, time = 0.528205 s
SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.591732 s
SSE2 (generic) ... reference result = 8108076510, time = 0.554423 s
SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.897921 s
SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.996889 s
SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.559919 s
AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.615783 s
AVX2 (generic) ... reference result = 8108076510, time = 0.386609 s
================================================
FILE: results/knights-landing-7210-gcc5.3.0-avx512f.txt
================================================
./speedup_avx512 data/i386.txt data/words
std::strstr ... reference result = 8108076510, time = 4.964439 s
SWAR 64-bit (generic) ... reference result = 8108076510, time = 8.205818 s
SSE2 (generic) ... reference result = 8108076510, time = 6.126381 s
SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 18.737857 s
SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 13.745691 s
SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 6.306659 s
AVX2 (MPSADBW) ... reference result = 8108076510, time = 13.179747 s
AVX2 (generic) ... reference result = 8108076510, time = 4.113571 s
AVX512F (MPSADBW-like) ... reference result = 8108076510, time = 2.348848 s
AVX512F (generic) ... reference result = 8108076510, time = 1.164081 s
./speedup_avx512 data/i386.txt data/words
std::strstr ... reference result = 8108076510, time = 4.946063 s
SWAR 64-bit (generic) ... reference result = 8108076510, time = 8.172884 s
SSE2 (generic) ... reference result = 8108076510, time = 6.107860 s
SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 18.717146 s
SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 13.724856 s
SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 6.288685 s
AVX2 (MPSADBW) ... reference result = 8108076510, time = 13.151361 s
AVX2 (generic) ... reference result = 8108076510, time = 4.094781 s
AVX512F (MPSADBW-like) ... reference result = 8108076510, time = 2.327864 s
AVX512F (generic) ... reference result = 8108076510, time = 1.142747 s
./speedup_avx512 data/i386.txt data/words
std::strstr ... reference result = 8108076510, time = 4.949234 s
SWAR 64-bit (generic) ... reference result = 8108076510, time = 8.170751 s
SSE2 (generic) ... reference result = 8108076510, time = 6.109035 s
SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 18.716665 s
SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 13.727568 s
SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 6.289994 s
AVX2 (MPSADBW) ... reference result = 8108076510, time = 13.153943 s
AVX2 (generic) ... reference result = 8108076510, time = 4.094941 s
AVX512F (MPSADBW-like) ... reference result = 8108076510, time = 2.326156 s
AVX512F (generic) ... reference result = 8108076510, time = 1.140567 s
================================================
FILE: results/postprocess.py
================================================
from collections import OrderedDict
def load(file):
    """Collect the best (smallest) time reported for every procedure.

    `file` is iterated line by line. Only lines containing the text
    'reference result' are used; such a line is split at '...' into the
    procedure name (whitespace-stripped) and a tail whose seventh
    whitespace-separated field is parsed as the time.

    Returns an OrderedDict mapping name -> minimum time, with names kept in
    the order of their first appearance.
    """
    fastest = OrderedDict()
    for row in file:
        if 'reference result' in row:
            label, rest = row.split('...')
            label = label.strip()
            elapsed = float(rest.split()[6])
            if label in fastest:
                fastest[label] = min(elapsed, fastest[label])
            else:
                fastest[label] = elapsed

    return fastest
def main():
    """Print the best time of every procedure for each results file.

    The paths of the results files are taken from the command line. When more
    than one path is given, each report is preceded by the path itself.

    Written so that it runs unchanged on both Python 2 and Python 3: print is
    called with a single argument and dict.items() replaces iteritems().
    """
    import sys

    paths = sys.argv[1:]
    for path in paths:
        if len(paths) > 1:
            print(path)

        with open(path, 'rt') as f:
            for name, time in load(f).items():
                print('%-30s %10.5f' % (name, time))


if __name__ == '__main__':
    main()
================================================
FILE: results/skylake-i7-6700-gcc5.4.1-avx2.txt
================================================
./speedup_avx2 data/i386.txt data/words
std::strstr ... reference result = 8108076510, time = 0.662049 s
SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.404260 s
SSE2 (generic) ... reference result = 8108076510, time = 0.489281 s
SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.638782 s
SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.879433 s
SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.390802 s
AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.570455 s
AVX2 (generic) ... reference result = 8108076510, time = 0.363694 s
./speedup_avx2 data/i386.txt data/words
std::strstr ... reference result = 8108076510, time = 0.662266 s
SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.404036 s
SSE2 (generic) ... reference result = 8108076510, time = 0.489313 s
SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.638926 s
SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.879193 s
SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.390626 s
AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.569980 s
AVX2 (generic) ... reference result = 8108076510, time = 0.363876 s
./speedup_avx2 data/i386.txt data/words
std::strstr ... reference result = 8108076510, time = 0.661478 s
SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.405280 s
SSE2 (generic) ... reference result = 8108076510, time = 0.488631 s
SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.638753 s
SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.879345 s
SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.390670 s
AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.569808 s
AVX2 (generic) ... reference result = 8108076510, time = 0.363091 s
================================================
FILE: results/skylake-i9-7900-gcc-5.4.1-avx512bw.txt
================================================
./speedup_avx512bw data/i386.txt data/words
naive scalar ... reference result = 8108076510, time = 4.872957 s
std::strstr ... reference result = 8108076510, time = 0.401080 s
SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.237922 s
SWAR 32-bit (generic) ... reference result = 8108076510, time = 2.044511 s
SSE2 (generic) ... reference result = 8108076510, time = 0.385573 s
SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.580510 s
SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.674341 s
SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.058753 s
SSE (naive) ... reference result = 8108076510, time = 1.709206 s
AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.444774 s
AVX2 (generic) ... reference result = 8108076510, time = 0.274761 s
AVX2 (naive) ... reference result = 8108076510, time = 0.918683 s
AVX2 (naive unrolled) ... reference result = 8108076510, time = 0.463246 s
AVX2-wide (naive) ... reference result = 8107771150, time = 0.441233 s
AVX512F (MPSADBW-like) ... reference result = 8108076510, time = 0.507046 s
AVX512F (generic) ... reference result = 8108076510, time = 0.262774 s
AVX512BW (generic) ... reference result = 8108076510, time = 0.220457 s
./speedup_avx512bw data/i386.txt data/words
naive scalar ... reference result = 8108076510, time = 4.816247 s
std::strstr ... reference result = 8108076510, time = 0.398468 s
SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.239442 s
SWAR 32-bit (generic) ... reference result = 8108076510, time = 2.050195 s
SSE2 (generic) ... reference result = 8108076510, time = 0.384561 s
SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.582862 s
SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.675480 s
SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.061598 s
SSE (naive) ... reference result = 8108076510, time = 1.676643 s
AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.439711 s
AVX2 (generic) ... reference result = 8108076510, time = 1.638515 s
AVX2 (naive) ... reference result = 8108076510, time = 0.984768 s
AVX2 (naive unrolled) ... reference result = 8108076510, time = 0.494318 s
AVX2-wide (naive) ... reference result = 8107771150, time = 0.479306 s
AVX512F (MPSADBW-like) ... reference result = 8108076510, time = 0.553042 s
AVX512F (generic) ... reference result = 8108076510, time = 0.290909 s
AVX512BW (generic) ... reference result = 8108076510, time = 0.237055 s
./speedup_avx512bw data/i386.txt data/words
naive scalar ... reference result = 8108076510, time = 6.406914 s
std::strstr ... reference result = 8108076510, time = 0.401352 s
SWAR 64-bit (generic) ... reference result = 8108076510, time = 1.237499 s
SWAR 32-bit (generic) ... reference result = 8108076510, time = 2.043457 s
SSE2 (generic) ... reference result = 8108076510, time = 0.385167 s
SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 0.581361 s
SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 0.675044 s
SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.059933 s
SSE (naive) ... reference result = 8108076510, time = 1.671910 s
AVX2 (MPSADBW) ... reference result = 8108076510, time = 0.444940 s
AVX2 (generic) ... reference result = 8108076510, time = 0.276522 s
AVX2 (naive) ... reference result = 8108076510, time = 0.921444 s
AVX2 (naive unrolled) ... reference result = 8108076510, time = 0.464818 s
AVX2-wide (naive) ... reference result = 8107771150, time = 0.442211 s
AVX512F (MPSADBW-like) ... reference result = 8108076510, time = 0.511326 s
AVX512F (generic) ... reference result = 8108076510, time = 0.265488 s
AVX512BW (generic) ... reference result = 8108076510, time = 0.221329 s
================================================
FILE: results/westmere-m540-gcc6.2.0-sse4.txt
================================================
./speedup data/i386.txt data/words
std::strstr ... reference result = 8108076510, time = 0.832291 s
SWAR 64-bit (generic) ... reference result = 8108076510, time = 2.498591 s
SSE2 (generic) ... reference result = 8108076510, time = 0.745890 s
SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 1.450405 s
SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 1.238676 s
SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.699681 s
./speedup data/i386.txt data/words
std::strstr ... reference result = 8108076510, time = 0.822457 s
SWAR 64-bit (generic) ... reference result = 8108076510, time = 2.518604 s
SSE2 (generic) ... reference result = 8108076510, time = 0.750936 s
SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 1.470000 s
SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 1.239929 s
SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.707411 s
./speedup data/i386.txt data/words
std::strstr ... reference result = 8108076510, time = 0.827280 s
SWAR 64-bit (generic) ... reference result = 8108076510, time = 2.535406 s
SSE2 (generic) ... reference result = 8108076510, time = 0.747252 s
SSE4.1 (MPSADBW) ... reference result = 8108076510, time = 1.456153 s
SSE4.1 (MPSADBW unrolled) ... reference result = 8108076510, time = 1.238485 s
SSE4.2 (PCMPESTRM) ... reference result = 8108076510, time = 1.711734 s
================================================
FILE: scalar.cpp
================================================
// Implementation by Daniel Lemire
// https://github.com/WojciechMula/sse4-strstr/issues/2
// Naive scalar substring search.
//
// Looks for the first occurrence of needle[0 .. needlesize) inside
// hay[0 .. size).
//
// Returns the offset of the first occurrence, or std::string::npos when there
// is none (this includes a needle longer than the haystack). An empty needle
// matches at offset 0.
size_t strstr_naive(const char * hay, size_t size, const char *needle, size_t needlesize) {

    // An empty needle matches at the very beginning. Handling it here also
    // keeps the code below from reading needle[0] and hay[size], both of which
    // lie outside the given ranges when needlesize == 0.
    if (needlesize == 0) {
        return 0;
    }

    if (size == needlesize) {
        return memcmp(hay, needle, size) == 0 ? 0 : std::string::npos;
    }

    const char first = needle[0];

    // number of valid start positions; not positive when the needle is longer
    // than the haystack, in which case the loop below never runs
    const ssize_t maxpos = ssize_t(size) - ssize_t(needlesize) + 1;

    for (ssize_t i = 0; i < maxpos; i++) {
        // skip forward to the next position holding the first needle byte
        if (hay[i] != first) {
            i++;
            while (i < maxpos && hay[i] != first) i++;
            if (i == maxpos) break;
        }

        // the first byte matches: compare the rest of the needle
        size_t j = 1;
        for ( ; j < needlesize; ++j)
            if (hay[i + j] != needle[j]) break;

        if (j == needlesize) return i;
    }

    return std::string::npos;
}
================================================
FILE: src/all.h
================================================
#pragma once
#include "common.h"
#include <utils/bits.cpp>
#include <errno.h>
#include "fixed-memcmp.cpp"
#include "scalar.cpp"
#include "swar64-strstr-v2.cpp"
#include "swar32-strstr-v2.cpp"
#ifdef HAVE_SSE_INSTRUCTIONS
# include <utils/sse.cpp>
# include "sse4-strstr.cpp"
# include "sse4-strstr-unrolled.cpp"
# include "sse4.2-strstr.cpp"
# include "sse2-strstr.cpp"
# include "sse-naive-strstr.cpp"
# include "sse2-needle4.cpp"
#endif
#ifdef HAVE_AVX2_INSTRUCTIONS
# include <utils/avx2.cpp>
# include "avx2-strstr.cpp"
# include "avx2-strstr-v2.cpp"
# include "avx2-naive-strstr.cpp"
# include "avx2-naive-strstr64.cpp"
# include "avx2-naive-unrolled-strstr.cpp"
#endif
#ifdef HAVE_AVX512F_INSTRUCTIONS
# include "avx512f-strstr.cpp"
# include "avx512f-strstr-v2.cpp"
#endif
#ifdef HAVE_AVX512BW_INSTRUCTIONS
# include "avx512bw-strstr-v2.cpp"
# include "avx512bw-strstr-v3.cpp"
#endif
#ifdef HAVE_NEON_INSTRUCTIONS
# include <utils/neon.cpp>
# include "neon-strstr-v2.cpp"
#endif
#ifdef HAVE_AARCH64_ARCHITECTURE
# include "aarch64-strstr-v2.cpp"
#endif
================================================
FILE: src/all_procedures.cpp
================================================
#include "all.h"
#include <string>
#include <vector>
#include <algorithm>
#include <stdexcept>
// Common signature of every substring-search procedure:
// (haystack, haystack length, needle, needle length) -> offset or std::string::npos.
using str_find_fun = size_t (*)(const char*, size_t, const char*, size_t);

// Registry of the available search procedures, addressable by a one-letter code.
struct Procedures {

    // A single registry entry.
    struct Item {
        str_find_fun proc;   // function to call (entries may be registered with nullptr)
        std::string  name;   // human readable label
        char         code;   // one-letter identifier used on the command line
        bool         builtin;

        Item(str_find_fun proc_, const char* name_, char code_, bool builtin_ = false)
            : proc(proc_), name(name_), code(code_), builtin(builtin_) {}
    };

    std::vector<Item> procedures;

    // Returns the first entry registered under `code`;
    // throws std::logic_error when there is no such entry.
    const Item& operator[](char code) {
        for (const Item& candidate: procedures) {
            if (candidate.code == code) {
                return candidate;
            }
        }

        std::string message = "can't find procedure with code '";
        message += code;
        message += "'";
        throw std::logic_error(message);
    }
};
// Adapter exposing the C library strstr through the str_find_fun signature.
// Both length arguments are ignored: strstr works on NUL-terminated strings.
// Returns the offset of the match or std::string::npos.
size_t strstr_libc(const char* s, size_t, const char* needle, size_t) {
    const char* hit = strstr(s, needle);
    return (hit == nullptr) ? std::string::npos : hit - s;
}
// Builds the registry of all search procedures compiled into this binary.
// The entries are appended in a fixed order; SIMD entries are present only
// when the corresponding HAVE_* macro is defined.
Procedures all_procedures() {

    Procedures db;
    auto& list = db.procedures;

    // baseline entries; 'b' and 'c' are flagged as built-in
    list.emplace_back(strstr_naive, "scalar (naive)", 'a');
    list.emplace_back(strstr_libc, "std::strstr", 'b', true);
    list.emplace_back(nullptr, "std::string::find", 'c', true);

    // The str_find_fun parameter picks the (const char*, size_t, ...) overload
    // of procedures that also have a std::string overload.
    const auto add = [&list](char code, const char* name, str_find_fun procedure) {
        list.emplace_back(procedure, name, code);
    };

    add('d', "SWAR 64-bit (generic)", swar64_strstr_v2);
    add('e', "SWAR 32-bit (generic)", swar32_strstr_v2);

#ifdef HAVE_SSE_INSTRUCTIONS
    add('f', "SSE2 (generic)", sse2_strstr_v2);
    add('g', "SSE4.1 (MPSADBW)", sse4_strstr);
    add('h', "SSE4.1 (MPSADBW unrolled)", sse4_strstr_unrolled);
    add('i', "SSE4.2 (PCMPESTRM)", sse42_strstr);
    add('j', "SSE (naive)", sse_naive_strstr);
    add('v', "SSE2 (4-byte needle)", sse2_strstr_needle4);
    add('w', "SSE2 (4-byte needle v2)", sse2_strstr_needle4_v2);
#endif

#ifdef HAVE_AVX2_INSTRUCTIONS
    add('k', "AVX2 (MPSADBW)", avx2_strstr);
    add('l', "AVX2 (generic)", avx2_strstr_v2);
    add('m', "AVX2 (naive)", avx2_naive_strstr);
    add('n', "AVX2 (naive unrolled)", avx2_naive_unrolled_strstr);
    add('o', "AVX2-wide (naive)", avx2_naive_strstr64);
#endif

#ifdef HAVE_AVX512F_INSTRUCTIONS
    add('p', "AVX512F (MPSADBW-like)", avx512f_strstr);
    add('q', "AVX512F (generic)", avx512f_strstr_v2);
#endif

#ifdef HAVE_AVX512BW_INSTRUCTIONS
    add('r', "AVX512BW (generic)", avx512bw_strstr_v2);
    add('s', "AVX512BW (masked)", avx512bw_strstr_v3);
#endif

#ifdef HAVE_NEON_INSTRUCTIONS
    add('t', "ARM Neon 32 bit (v2)", neon_strstr_v2);
#endif

#ifdef HAVE_AARCH64_ARCHITECTURE
    add('u', "AArch64 64 bit (v2)", aarch64_strstr_v2);
#endif

    return db;
}
================================================
FILE: src/application_base.cpp
================================================
// Shared base of the command-line tools: loads the searched text and the
// list of words (needles) from files.
class ApplicationBase {

protected:
    std::string file;                 // text loaded by load_text()
    std::vector<std::string> words;   // non-empty lines loaded by load_words()

public:
    // Exception thrown when an input file cannot be opened or examined.
    class Error final {
    public:
        const std::string message;

    public:
        Error(const std::string& msg) : message(msg) {}
    };

public:
    // Loads the text from `file_name` and the words from `words_name`.
    // Throws Error when either file cannot be opened.
    void prepare(const std::string& file_name, const std::string& words_name) {
        load_text(file_name);
        load_words(words_name);
    }

private:
    // Reads the file into `file`, leaving out its final byte (the size is
    // measured one byte before the end; presumably this drops a trailing
    // newline).  As before, the content is assigned as a C string, so it
    // ends at the first NUL byte if there is one.
    void load_text(const std::string& path) {
        FILE* f = fopen(path.c_str(), "rt");
        if (f == nullptr) {
            throw_errno(path);
        }

        // fseek fails on an empty file; the position (and the size) is then 0
        long size = 0;
        if (fseek(f, -1, SEEK_END) == 0) {
            size = ftell(f);
        }
        if (size < 0) {
            const int saved = errno; // fclose may overwrite errno
            fclose(f);
            errno = saved;
            throw_errno(path);
        }
        rewind(f);

        // one extra byte for the terminator (the old code wrote it past the buffer)
        std::vector<char> buffer(static_cast<size_t>(size) + 1);
        const size_t got = fread(buffer.data(), 1, static_cast<size_t>(size), f);
        buffer[got] = 0; // terminate right after the bytes that were really read
        fclose(f);

        file = buffer.data();
    }

    // Appends every non-empty line of the file to `words`; a trailing '\n'
    // is stripped.  Lines longer than the local buffer are split by fgets.
    void load_words(const std::string& path) {
        char buffer[1024];

        FILE* f = fopen(path.c_str(), "rt");
        if (f == nullptr) {
            throw_errno(path);
        }

        // Testing the fgets result (rather than feof before reading) avoids
        // re-adding the previous line when the read at end-of-file fails.
        while (fgets(buffer, sizeof(buffer), f) != nullptr) {
            auto len = strlen(buffer);
            if (len > 0 && buffer[len - 1] == '\n') {
                buffer[--len] = 0;
            }
            if (len == 0) { // skip empty strings
                continue;
            }
            words.push_back(buffer);
        }

        fclose(f);
    }

    // Throws Error built from `prefix` and the description of the current errno.
    void throw_errno(const std::string& prefix) {
        const std::string msg = prefix + ": " + std::string(strerror(errno));
        throw Error(msg);
    }
};
================================================
FILE: src/benchmark.cpp
================================================
#include <cstdio>
#include <cstdint>
#include <cassert>
#include <cstring>
#include <string>
#include <vector>
#include "all_procedures.cpp"
// ------------------------------------------------------------------------
#include <utils/ansi.cpp>
#include "benchmark.h"
#include "application_base.cpp"
// Micro-benchmark driver: builds a synthetic haystack that contains the
// needle at a chosen position and measures every enabled procedure with
// BEST_TIME.
class Application final: public ApplicationBase {

    Procedures db;

public:
    enum class TestType {
        OptimisticCase,
        Random,
        WorstCase
    };

    struct Parameters {
        size_t      needle_position;  // number of filler characters before the needle
        size_t      needle_size;
        size_t      count;            // repeat count passed to BEST_TIME
        TestType    test_type;
        std::string procedure_codes;  // empty string enables all procedures
    };

public:
    Application(const Parameters& params)
        : db(all_procedures())
        , parameters(params) {

        prepare();
    }

    // Measures all enabled procedures; always returns true.
    bool operator()() {
        // strstr is treated as built-in function by GCC
        // it seems it's wiped out in benchmark
        const bool measure_stdstring = false;

#if defined(HAVE_NEON_INSTRUCTIONS) && !defined(HAVE_AARCH64_ARCHITECTURE)
        // On Raspberry Pi it's terribly slow, but on Aarch64
        // the 64-bit procedure is pretty fast
        const bool measure_swar64 = false;
#else
        const bool measure_swar64 = true;
#endif

        // Measures the std::string overload of `function` when `code` is enabled.
        // The code has to be the one registered in all_procedures(): measure()
        // looks the procedure name up by that code and the lookup throws
        // std::logic_error for a code that is not registered.
#define MEASURE(code, function) \
        if (is_enabled(code)) { \
            auto find = [](const std::string& s, const std::string& neddle) -> size_t { \
                return function(s, neddle); \
            }; \
            measure(find, code); \
        }

        if (is_enabled('a')) {
            auto find = [](const std::string& s, const std::string& neddle) -> size_t {
                return strstr_naive(s.data(), s.size(), neddle.data(), neddle.size());
            };
            measure(find, 'a');
        }

        if (is_enabled('b')) {
            auto find = [](const std::string& s, const std::string& neddle) -> size_t {
                const char* res = strstr(s.data(), neddle.data());
                if (res != nullptr) {
                    return res - s.data();
                } else {
                    return std::string::npos;
                }
            };
            measure(find, 'b');
        }

        if (measure_stdstring && is_enabled('c')) {
            auto find = [](const std::string& s, const std::string& neddle) -> size_t {
                return s.find(neddle);
            };
            measure(find, 'c');
        }

        if (measure_swar64) {
            MEASURE('d', swar64_strstr_v2);
        }
        MEASURE('e', swar32_strstr_v2);

#ifdef HAVE_SSE_INSTRUCTIONS
        MEASURE('f', sse2_strstr_v2);
        MEASURE('g', sse4_strstr);
        MEASURE('h', sse4_strstr_unrolled);
        MEASURE('i', sse42_strstr);
        MEASURE('j', sse_naive_strstr);
        MEASURE('v', sse2_strstr_needle4);
        MEASURE('w', sse2_strstr_needle4_v2);
#endif

#ifdef HAVE_AVX2_INSTRUCTIONS
        MEASURE('k', avx2_strstr);
        MEASURE('l', avx2_strstr_v2);
        MEASURE('m', avx2_naive_strstr);
        MEASURE('n', avx2_naive_unrolled_strstr);
        MEASURE('o', avx2_naive_strstr64);
#endif

#ifdef HAVE_AVX512F_INSTRUCTIONS
        MEASURE('p', avx512f_strstr);
        MEASURE('q', avx512f_strstr_v2);
#endif

        // The three sections below used the codes 'u', 's' and 't', which do
        // not match the registry ('s' = AVX512BW masked, 't' = Neon,
        // 'u' = AArch64), so the name lookup failed on those builds.
#ifdef HAVE_AVX512BW_INSTRUCTIONS
        MEASURE('r', avx512bw_strstr_v2);
        MEASURE('s', avx512bw_strstr_v3);
#endif

#ifdef HAVE_NEON_INSTRUCTIONS
        MEASURE('t', neon_strstr_v2);
#endif

#ifdef HAVE_AARCH64_ARCHITECTURE
        MEASURE('u', aarch64_strstr_v2);
#endif

#undef MEASURE

        return true;
    }

    // Prints the usage text followed by the list of registered procedures.
    static void print_help(const char* progname) {
        std::printf("%s needle-position needle-size iteration-count test-name [procedures]\n", progname);
        std::puts("");
        std::puts("Parameters:");
        std::puts("");
        std::puts(" needle-position position of the needle");
        std::puts(" needle-size length of the needle");
        std::puts(" count how many times test is repeated");
        std::puts(" test-name one of 'optimistic', 'random', 'worst'");
        std::puts(" procedures procedure code(s), listed below [by default all will be tested]");
        std::puts("");
        std::puts("Test kinds");
        std::puts("");
        std::puts(" optimistic data before needle contains characters don't present in the needle");
        std::puts(" random data before needle contains some random characters");
        std::puts(" worst needle has form 'aaa...aaaXaaa...aaa', and data before is filled with the 'a'");
        std::puts("");
        std::puts("Following procedures are available:");
        for (auto& item: all_procedures().procedures) {
            printf(" [%c] %s\n", item.code, item.name.c_str());
        }
    }

private:
    // The result of every call is stored here so the call is not optimized away.
    volatile size_t sink;

    // Runs BEST_TIME for `find`; the printed label is the registry name of `code`.
    template <typename T_FIND>
    void measure(T_FIND find, char code) {
        BEST_TIME(/**/,
                  sink = find(input, needle),
                  db[code].name.c_str(),
                  parameters.count,
                  parameters.needle_position);
    }

    bool is_enabled(char proc) const {
        return (parameters.procedure_codes.empty())
            || (parameters.procedure_codes.find(proc) != std::string::npos);
    }

    // Needle of the form 'aaa...aXa...aaa' with 'X' at index needle_size/2.
    void prepare_needle() {
        needle.append(parameters.needle_size/2, 'a');
        needle.append(1, 'X');
        needle.append(parameters.needle_size - needle.size(), 'a');
    }

    // Haystack: needle_position filler characters, the needle, then padding.
    void prepare_input() {
        const size_t padding = 256;

        switch (parameters.test_type) {
            case TestType::OptimisticCase:
                input.assign(parameters.needle_position, '_');
                break;

            case TestType::WorstCase:
                input.assign(parameters.needle_position, 'a');
                break;

            case TestType::Random:
                for (size_t i=0; i < parameters.needle_position; i++) {
                    const char c = rand() % ('z' - 'a' + 1) + 'a';
                    input.push_back(c);
                }
                break;
        }

        input += needle;
        input.append(padding, '_'); // to make sure that memory after the needle is accessible
    }

    void prepare() {
        prepare_needle();
        prepare_input();
    }

    std::string needle;
    std::string input;
    Parameters parameters;
};
// Converts a numeric command-line argument with atoi (non-numeric text
// yields 0, as before) and rejects negative values, which would otherwise
// wrap around to huge size_t values.
// Throws std::runtime_error naming the offending parameter.
static size_t parse_non_negative(const char* text, const char* what) {
    const int value = atoi(text);
    if (value < 0) {
        throw std::runtime_error(std::string(what) + " must not be negative, got '" + text + "'");
    }
    return static_cast<size_t>(value);
}

// Fills `p` from the command line.
//
// Returns false when the help text should be printed (too few arguments or
// -h/--help present), true otherwise.
// Throws std::runtime_error when an argument has an invalid value.
bool parse(int argc, char* argv[], Application::Parameters& p) {

    if (argc < 5) {
        return false;
    }

    for (int i=1; i < argc; i++) {
        const std::string tmp = argv[i];
        if (tmp == "-h" || tmp == "--help")
            return false;
    }

    p.needle_position = parse_non_negative(argv[1], "needle position");
    p.needle_size     = parse_non_negative(argv[2], "needle size");
    p.count           = parse_non_negative(argv[3], "count");

    if (p.needle_size < 3) {
        throw std::runtime_error("needle size must be greater than 2");
    }

    if (p.count == 0) {
        throw std::runtime_error("count must be greater than 0");
    }

    std::string tmp(argv[4]);
    if (tmp == "optimistic") {
        p.test_type = Application::TestType::OptimisticCase;
    } else if (tmp == "worst") {
        p.test_type = Application::TestType::WorstCase;
    } else if (tmp == "random") {
        p.test_type = Application::TestType::Random;
    } else {
        throw std::runtime_error("expected 'optimistic', 'worst' or 'random', got '" + tmp + "'");
    }

    if (argc >= 6) {
        p.procedure_codes = argv[5];
    }

    return true;
}
int main(int argc, char* argv[]) {
try {
Application::Parameters params;
if (!parse(argc, argv, params)) {
Application::print_help(argv[0]);
return EXIT_FAILURE;
}
Application app(params);
return app() ? EXIT_SUCCESS : EXIT_FAILURE;
} catch (std::runtime_error& err) {
const auto msg = ansi::seq("Error", ansi::RED);
printf("%s: %s\n", msg.data(), err.what());
return EXIT_FAILURE;
} catch (ApplicationBase::Error& err) {
const auto msg = ansi::seq("Error", ansi::RED);
printf("%s: %s\n", msg.data(), err.message.data());
return EXIT_FAILURE;
}
}
================================================
FILE: src/benchmark.h
================================================
#ifndef _BENCHMARK_H_
#define _BENCHMARK_H_
#include <stdint.h>
/*
 * Stores the current time-stamp counter value in `cycles` (an assignable
 * 64-bit expression): executes CPUID followed by RDTSC and combines EDX
 * (upper 32 bits) with EAX (lower 32 bits).
 * NOTE(review): CPUID is presumably issued as a serializing instruction so
 * earlier code cannot leak into the measured region - confirm.
 * NOTE(review): x86 mnemonics with 64-bit register names in the clobber
 * list - looks x86-64 only; confirm before using on other targets.
 */
#define RDTSC_START(cycles) \
do { \
uint32_t cyc_high, cyc_low; \
__asm volatile("cpuid\n" \
"rdtsc\n" \
"mov %%edx, %0\n" \
"mov %%eax, %1" : \
"=r" (cyc_high), \
"=r"(cyc_low) : \
: /* no read only */ \
"%rax", "%rbx", "%rcx", "%rdx" /* clobbers */ \
); \
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
} while (0)
/*
 * Counterpart of RDTSC_START: executes RDTSCP, copies EDX/EAX into the
 * upper/lower halves of `cycles` and only then executes CPUID.
 * NOTE(review): the RDTSCP-then-CPUID order presumably keeps later code out
 * of the measured region - confirm.
 */
#define RDTSC_STOP(cycles) \
do { \
uint32_t cyc_high, cyc_low; \
__asm volatile("rdtscp\n" \
"mov %%edx, %0\n" \
"mov %%eax, %1\n" \
"cpuid" : \
"=r"(cyc_high), \
"=r"(cyc_low) : \
/* no read only registers */ : \
"%rax", "%rbx", "%rcx", "%rdx" /* clobbers */ \
); \
(cycles) = ((uint64_t)cyc_high << 32) | cyc_low; \
} while (0)
/*
 * Trivial non-inlined function returning its argument; BEST_TIME passes a
 * call to it to RDTSC_SET_OVERHEAD as the workload measured during calibration.
 */
static __attribute__ ((noinline))
uint64_t rdtsc_overhead_func(uint64_t dummy) {
return dummy;
}
/*
 * Measurement overhead in cycles, subtracted from every BEST_TIME sample.
 * UINT64_MAX means "not calibrated yet"; BEST_TIME calibrates on first use.
 */
uint64_t global_rdtsc_overhead = (uint64_t) UINT64_MAX;
/*
 * Calibrates global_rdtsc_overhead: times `test` `repeat` times with
 * RDTSC_START/RDTSC_STOP, stores the smallest observed cycle count in
 * global_rdtsc_overhead and prints it.
 */
#define RDTSC_SET_OVERHEAD(test, repeat) \
    do { \
        uint64_t cycles_start, cycles_final, cycles_diff; \
        uint64_t min_diff = UINT64_MAX; \
        for (unsigned i = 0; i < repeat; i++) { \
            __asm volatile("" ::: /* pretend to clobber */ "memory"); \
            RDTSC_START(cycles_start); \
            test; \
            RDTSC_STOP(cycles_final); \
            cycles_diff = (cycles_final - cycles_start); \
            if (cycles_diff < min_diff) min_diff = cycles_diff; \
        } \
        global_rdtsc_overhead = min_diff; \
        printf("rdtsc_overhead set to %d\n", (int)global_rdtsc_overhead); \
    } while (0)
/*
 * Prints the best (minimum) and the average number of cycles per operation,
 * where pre is code executed before each timed run, test is the measured
 * expression, test_name is the label printed in front of the results,
 * repeat is the number of times the measurement is repeated and size is the
 * number of operations represented by a single run of test.
 */
#define BEST_TIME(pre, test, test_name, repeat, size) \
do { \
if (global_rdtsc_overhead == UINT64_MAX) { \
RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), repeat); \
} \
printf("%-30s\t: ", test_name); fflush(stdout); \
uint64_t cycles_start, cycles_final, cycles_diff; \
uint64_t min_diff = (uint64_t)-1; \
uint64_t sum_diff = 0; \
for (size_t i = 0; i < repeat; i++) { \
pre; \
__asm volatile("" ::: /* pretend to clobber */ "memory"); \
RDTSC_START(cycles_start); \
test; \
RDTSC_STOP(cycles_final); \
cycles_diff = (cycles_final - cycles_start - global_rdtsc_overhead); \
if (cycles_diff < min_diff) min_diff = cycles_diff; \
sum_diff += cycles_diff; \
} \
uint64_t S = size; \
float cycle_per_op = (min_diff) / (double)S; \
float avg_cycle_per_op = (sum_diff) / ((double)S * repeat); \
printf(" %8.3f cycle/op (best) %8.3f cycle/op (avg)\n", cycle_per_op, avg_cycle_per_op); \
} while (0)
#endif
================================================
FILE: src/speedup.cpp
================================================
#include <cstdio>
#include <cstdint>
#include <cassert>
#include <cstring>
#include <string>
#include <vector>
#include <chrono>
#include "all_procedures.cpp"
// ------------------------------------------------------------------------
#include <utils/ansi.cpp>
#include "application_base.cpp"
// Speed comparison tool: searches every word from the word list in the
// loaded text `count` times with each enabled procedure and prints the
// elapsed wall-clock time.
class Application final: public ApplicationBase {

    Procedures db;
    std::size_t count;
    const std::string procedure_codes;

public:
    struct Parameters {
        std::string file_name;
        std::string words_name;
        size_t      count = 10;
        std::string procedure_codes;  // empty string enables all procedures
    };

public:
    Application(const Parameters& params)
        : db(all_procedures())
        , count(params.count)
        , procedure_codes(params.procedure_codes) {

        prepare(params.file_name, params.words_name);
    }

    // Measures all enabled procedures; always returns true.
    bool operator()() {

#if defined(__GNUC__) && !defined(HAVE_NEON_INSTRUCTIONS)
        // GNU std::string::find was proven to be utterly slow,
        // don't waste our time on reconfirming that fact.
        //
        // (On Raspberry Pi it's fast, though)
        const bool measure_stdstring = false;
#else
        const bool measure_stdstring = true;
#endif

#if defined(HAVE_NEON_INSTRUCTIONS) && !defined(HAVE_AARCH64_ARCHITECTURE)
        // On Raspberry Pi it's terribly slow, but on Aarch64
        // the 64-bit procedure is pretty fast
        const bool measure_swar64 = false;
#else
        const bool measure_swar64 = true;
#endif

        // Measures the std::string overload of `function` when `code` is enabled.
        // The code has to be the one registered in all_procedures(): measure()
        // looks the procedure name up by that code and the lookup throws
        // std::logic_error for a code that is not registered.
#define MEASURE(code, function) \
        if (is_enabled(code)) { \
            auto find = [](const std::string& s, const std::string& neddle) -> size_t { \
                return function(s, neddle); \
            }; \
            measure(find, code); \
        }

        if (is_enabled('a')) {
            auto find = [](const std::string& s, const std::string& neddle) -> size_t {
                return strstr_naive(s.data(), s.size(), neddle.data(), neddle.size());
            };
            measure(find, 'a');
        }

        if (is_enabled('b')) {
            auto find = [](const std::string& s, const std::string& neddle) -> size_t {
                const char* res = strstr(s.data(), neddle.data());
                if (res != nullptr) {
                    return res - s.data();
                } else {
                    return std::string::npos;
                }
            };
            measure(find, 'b');
        }

        if (measure_stdstring && is_enabled('c')) {
            auto find = [](const std::string& s, const std::string& neddle) -> size_t {
                return s.find(neddle);
            };
            measure(find, 'c');
        }

        if (measure_swar64) {
            MEASURE('d', swar64_strstr_v2);
        }
        MEASURE('e', swar32_strstr_v2);

#ifdef HAVE_SSE_INSTRUCTIONS
        MEASURE('f', sse2_strstr_v2);
        MEASURE('g', sse4_strstr);
        MEASURE('h', sse4_strstr_unrolled);
        MEASURE('i', sse42_strstr);
        MEASURE('j', sse_naive_strstr);
#endif

#ifdef HAVE_AVX2_INSTRUCTIONS
        MEASURE('k', avx2_strstr);
        MEASURE('l', avx2_strstr_v2);
        MEASURE('m', avx2_naive_strstr);
        MEASURE('n', avx2_naive_unrolled_strstr);
        MEASURE('o', avx2_naive_strstr64);
#endif

#ifdef HAVE_AVX512F_INSTRUCTIONS
        MEASURE('p', avx512f_strstr);
        MEASURE('q', avx512f_strstr_v2);
#endif

#ifdef HAVE_AVX512BW_INSTRUCTIONS
        MEASURE('r', avx512bw_strstr_v2);
#endif

        // Neon and AArch64 are registered as 't' and 'u'; the codes 's' and
        // 't' used here before made the name lookup fail on those builds.
#ifdef HAVE_NEON_INSTRUCTIONS
        MEASURE('t', neon_strstr_v2);
#endif

#ifdef HAVE_AARCH64_ARCHITECTURE
        MEASURE('u', aarch64_strstr_v2);
#endif

#undef MEASURE

        return true;
    }

    // Prints the usage text followed by the list of registered procedures.
    static void print_help(const char* progname) {
        std::printf("%s file words [count] [procedure]\n", progname);
        std::puts("");
        std::puts("Parameters:");
        std::puts("");
        std::puts(" file - arbitrary file");
        std::puts(" words - list of words in separate lines");
        std::puts(" count - repeat count (optional, default = 10)");
        std::puts(" procedure - letter(s) from square brackets (by default all functions are checked)");
        std::puts("");
        std::puts("Following procedures ara available:");
        for (auto& item: all_procedures().procedures) {
            printf(" [%c] %s\n", item.code, item.name.c_str());
        }
    }

private:
    // Runs `find` for every word, `count` times over, and prints the sum of
    // all results (a checksum to compare between procedures) with the time.
    template <typename T_FIND>
    void measure(T_FIND find, char code) {
        printf("%-40s... ", db[code].name.c_str());
        fflush(stdout);

        size_t result = 0;

        const auto t1 = std::chrono::high_resolution_clock::now();

        auto k = count;
        while (k != 0) {
            for (const auto& word: words) {
                result += find(file, word);
            }
            k--;
        }

        const auto t2 = std::chrono::high_resolution_clock::now();
        const std::chrono::duration<double> td = t2-t1;

        // %zu is the conversion matching size_t
        printf("reference result = %zu, time = %10.6f s\n", result, td.count());
    }

    bool is_enabled(char proc) const {
        return (procedure_codes.empty())
            || (procedure_codes.find(proc) != std::string::npos);
    }
};
// Fills `p` from the command line.
//
// Returns false when the help text should be printed (too few arguments or
// -h/--help present), true otherwise.  An invalid repeat count is reported
// and the default kept.
bool parse(int argc, char* argv[], Application::Parameters& p) {

    if (argc < 3) {
        return false;
    }

    for (int i=1; i < argc; i++) {
        const std::string tmp = argv[i];
        if (tmp == "-h" || tmp == "--help")
            return false;
    }

    p.file_name  = argv[1];
    p.words_name = argv[2];

    if (argc >= 4) {
        // keep the value signed: a negative count must be rejected here, not
        // converted to a huge size_t
        const int tmp = atoi(argv[3]);
        if (tmp > 0) {
            p.count = static_cast<size_t>(tmp);
        } else {
            printf("repeat count '%s' invalid, keeping default %zu\n", argv[3], p.count);
        }
    }

    if (argc >= 5) {
        p.procedure_codes = argv[4];
    }

    return true;
}
int main(int argc, char* argv[]) {
try {
Application::Parameters params;
if (!parse(argc, argv, params)) {
Application::print_help(argv[0]);
return EXIT_FAILURE;
}
Application app(params);
return app() ? EXIT_SUCCESS : EXIT_FAILURE;
} catch (ApplicationBase::Error& err) {
const auto msg = ansi::seq("Error: ", ansi::RED);
printf("%s: %s\n", msg.data(), err.message.data());
return EXIT_FAILURE;
}
}
================================================
FILE: src/unittests.cpp
================================================
#include <cstdio>
#include <cstdint>
#include <cassert>
#include <cstring>
#include <string>
#include <vector>
#include "all.h"
#include <utils/ansi.cpp>
#include "all_procedures.cpp"
// Exercises one search procedure on generated inputs: the needle
// '$' + 'x' * size + '#' is surrounded by n leading and k trailing dots and
// the procedure must report offset n.
// Prints the verdict and returns true when every case passed.
bool test(const char* name, str_find_fun strstr_function) {

    std::printf("%s... ", name);
    std::fflush(stdout);

    const size_t max_padding = 3*16;

    for (size_t size = 1; size < 64; size++) {

        const std::string neddle = "$" + std::string(size, 'x') + "#";

        for (size_t n = 0; n < max_padding; n++) {
            for (size_t k = 0; k < max_padding; k++) {

                // '.' * n + '$' + 'x' * size + '#' + '.' * k
                std::string str(n, '.');
                str += neddle;
                str.append(k, '.');

                const auto result = strstr_function(str.data(), str.size(), neddle.data(), neddle.size());
                if (result == n) {
                    continue;
                }

                printf("%s\n", ansi::seq("FAILED", ansi::RED).c_str());
                printf(" string = '%s' (length %lu)\n", str.data(), str.size());
                printf(" neddle = '%s' (length %lu)\n", neddle.data(), neddle.size());
                printf(" expected result = %lu, actual result = %lu\n", n, result);
                return false;
            }
        }
    }

    printf("%s\n", ansi::seq("OK", ansi::GREEN).c_str());
    return true;
}
// Runs test() for every registered procedure that is not flagged as
// built-in; the exit status is EXIT_FAILURE when at least one of them failed.
int main() {

    puts("running unit tests");

    auto db = all_procedures();
    bool all_passed = true;

    for (auto& item: db.procedures) {
        if (!item.builtin) {
            // keep going after a failure so every procedure gets reported
            const bool passed = test(item.name.c_str(), item.proc);
            all_passed = all_passed && passed;
        }
    }

    return all_passed ? EXIT_SUCCESS : EXIT_FAILURE;
}
================================================
FILE: src/validate.cpp
================================================
#include <cstdio>
#include <cstdint>
#include <cassert>
#include <cstring>
#include <string>
#include <vector>
// ------------------------------------------------------------------------
#include "all_procedures.cpp"
// ------------------------------------------------------------------------
#include <utils/ansi.cpp>
#include "application_base.cpp"
// Validation tool: checks that every registered, non-built-in procedure
// returns the same offset as std::string::find for each word of the list.
class Application final: public ApplicationBase {

public:
    Application(const std::string& file_name, const std::string& words_name) {
        prepare(file_name, words_name);
    }

    // Returns true when all procedures agree with std::string::find for all
    // words; stops and returns false at the first mismatch.
    bool run() {

        auto db = all_procedures();
        const auto total = words.size();

        size_t index = 0;
        for (const auto& word: words) {

            if (index % 100 == 0) {
                print_progress(index, total);
            }
            index++;

            const size_t expected = file.find(word);

            for (auto& item: db.procedures) {
                if (item.builtin) {
                    continue;
                }

                const size_t actual = item.proc(file.data(), file.size(), word.data(), word.size());
                if (expected == actual) {
                    continue;
                }

                putchar('\n');
                const auto msg = ansi::seq("ERROR", ansi::RED);
                printf("%s: std::find result = %lu, %s = %lu\n",
                       msg.data(), expected, item.name.c_str(), actual);
                printf("word: '%s' (length %lu)\n", word.data(), word.size());
                return false;
            }
        }

        print_progress(total, total);
        putchar('\n');

        printf("%s\n", ansi::seq("OK", ansi::GREEN).c_str());
        return true;
    }

    // Prints the usage text.
    static void print_help(const char* progname) {
        std::printf("usage: %s [file] [words]\n", progname);
        std::puts("");
        std::puts("Search all words in a file using std::string::find and SSE4 procedure");
        std::puts("");
        std::puts("Parameters:");
        std::puts("");
        std::puts(" file - arbitrary file");
        std::puts(" words - list of words in separate lines");
    }

private:
    // Rewrites the progress line in place (it ends with '\r', not '\n').
    void print_progress(size_t pos, size_t n) {
        printf("validating... %0.2f%% (%lu/%lu)\r", 100.0*pos/n, pos, n);
        fflush(stdout);
    }
};
int main(int argc, char* argv[]) {
if (argc == 3) {
try {
Application app(argv[1], argv[2]);
const auto ret = app.run();
return ret ? EXIT_SUCCESS : EXIT_FAILURE;
} catch (ApplicationBase::Error& err) {
const auto msg = ansi::seq("Error: ", ansi::RED);
printf("%s: %s\n", msg.data(), err.message.data());
return EXIT_FAILURE;
}
} else {
Application::print_help(argv[0]);
return EXIT_FAILURE;
}
}
================================================
FILE: sse-naive-strstr.cpp
================================================
// Method described in https://arxiv.org/pdf/1612.01506.pdf
//
// Implementation by Daniel Lemire
// https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/blob/master/extra/simd/substring/substring.c
// Checks 16 candidate start positions at a time: for each needle character j
// the 16 bytes at s + base + j are compared with that character and the
// resulting bit masks are AND-ed together; a bit that survives all k rounds
// marks a match.
//
// The vector loads are always 16 bytes wide, so bytes beyond s + n are read,
// and the returned offset may lie beyond n - k; sse_naive_strstr filters
// such results.  NOTE(review): callers must keep the memory after the
// haystack readable - confirm for every call site.
size_t FORCE_INLINE sse_naive_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) {

    assert(k > 0);
    assert(n > 0);

    // equal lengths: one plain comparison settles it
    if (n == k) {
        return (memcmp(s, needle, k) == 0) ? 0 : std::string::npos;
    }

    const size_t limit = n - k + 1;

    for (size_t base = 0; base < limit; base += 16) {

        uint16_t candidates = 0xffff;

        for (size_t offset = 0; offset < k; ++offset) {
            if (candidates == 0) {
                break; // no start position in this block can match any more
            }

            const __m128i chunk   = _mm_loadu_si128((const __m128i *)(s + base + offset));
            const __m128i pattern = _mm_set1_epi8(needle[offset]);
            const uint16_t hits   = _mm_movemask_epi8(_mm_cmpeq_epi8(chunk, pattern));

            candidates = candidates & hits;
        }

        if (candidates == 0) {
            continue;
        }

        // lowest set bit = first matching start position in the block
        return base + __builtin_ctz(candidates);
    }

    return std::string::npos;
}
// ------------------------------------------------------------------------
// Substring search built on sse_naive_strstr_anysize.
// Returns the offset of the needle in s, or std::string::npos when the
// needle is longer than the haystack or the reported offset would place the
// needle past the end of the haystack.
size_t sse_naive_strstr(const char* s, size_t n, const char* needle, size_t k) {

    if (n < k) {
        return std::string::npos;
    }

    const size_t position = sse_naive_strstr_anysize(s, n, needle, k);

    // offsets above n - k are not inside the haystack (npos lands here too)
    return (position <= n - k) ? position : std::string::npos;
}
// ------------------------------------------------------------------------
size_t sse_naive_strstr(const std::string& s, const std::string& needle) {
return sse_naive_strstr(s.data(), s.size(), needle.data(), needle.size());
}
================================================
FILE: sse2-needle4.cpp
================================================
// Searches for a 4-byte needle, testing four start positions per iteration.
//
// Only the first four bytes of `needle` are compared; the public wrapper
// sse2_strstr_needle4 calls this function with k == 4 only.
// Each iteration reads 8 bytes from s + i, so up to 4 bytes past the end of
// the haystack are read.  NOTE(review): callers must keep that memory
// readable - confirm for every call site.
//
// Returns the offset of the first match or std::string::npos.
size_t FORCE_INLINE sse2_needle4(const char* s, size_t n, const char* needle, size_t k) {

    if (n < k) {
        // without this guard n - k would wrap around
        return std::string::npos;
    }

    uint32_t u32;
    memcpy(&u32, needle, sizeof(u32));

    const __m128i v_needle = _mm_set1_epi32(u32);
    const __m128i shuffle  = _mm_setr_epi8(0, 1, 2, 3,
                                           1, 2, 3, 4,
                                           2, 3, 4, 5,
                                           3, 4, 5, 6);

    const size_t last = n - k; // last valid start position

    for (size_t i = 0; i <= last; i += 4) {
        // 1. load 8 bytes (7 are used):
        //    [abcd|efg?|????|????]
        uint64_t u64;
        memcpy(&u64, &s[i], sizeof(u64));
        const __m128i t0 = _mm_cvtsi64x_si128(u64);

        // 2. make all possible 4-byte substrings
        //    [abcd|bcde|cdef|defg]
        // (the data is the first operand and the byte indices the second)
        const __m128i t1 = _mm_shuffle_epi8(t0, shuffle);

        // 3. compare the 4-byte substrings with the needle
        const __m128i t2 = _mm_cmpeq_epi32(v_needle, t1);

        // bit j of the mask is set when the substring at i + j matches
        const int mask = _mm_movemask_ps(_mm_castsi128_ps(t2));
        if (mask != 0) {
            // lowest set bit = earliest match
            const size_t position = i + __builtin_ctz(mask);

            // a match starting beyond `last` overlaps the bytes after the haystack
            return (position <= last) ? position : std::string::npos;
        }
    }

    return std::string::npos;
}
// ------------------------------------------------------------------------
// Substring search restricted to needles of exactly four bytes.
// Returns std::string::npos for any other needle length and when the
// haystack is shorter than the needle (the same n < k guard the other
// wrappers such as sse_naive_strstr apply before searching).
size_t sse2_strstr_needle4(const char* s, size_t n, const char* needle, size_t k) {

    if (k != 4 || n < k) {
        return std::string::npos;
    }

    return sse2_needle4(s, n, needle, k);
}
// ------------------------------------------------------------------------
size_t sse2_strstr_needle4(const std::string& s, const std::string& needle) {
return sse2_strstr_needle4(s.data(), s.size(), needle.data(), needle.size());
}
// Variant of sse2_needle4 that tests eight start positions per iteration.
//
// Only the first four bytes of `needle` are compared; the public wrapper
// sse2_strstr_needle4_v2 calls this function with k == 4 only.
// Each iteration loads 16 bytes from s + i, so bytes past the end of the
// haystack are read.  NOTE(review): callers must keep that memory readable
// - confirm for every call site.
//
// Returns the offset of the first match or std::string::npos.
size_t FORCE_INLINE sse2_needle4_v2(const char* s, size_t n, const char* needle, size_t k) {

    if (n < k) {
        // without this guard n - k would wrap around
        return std::string::npos;
    }

    uint32_t u32;
    memcpy(&u32, needle, sizeof(u32));

    const __m128i v_needle = _mm_set1_epi32(u32);
    const __m128i shuffle0 = _mm_setr_epi8(0, 1, 2, 3,
                                           1, 2, 3, 4,
                                           2, 3, 4, 5,
                                           3, 4, 5, 6);
    const __m128i shuffle1 = _mm_setr_epi8(4, 5, 6, 7,
                                           5, 6, 7, 8,
                                           6, 7, 8, 9,
                                           7, 8, 9, 10);

    const size_t last = n - k; // last valid start position

    for (size_t i = 0; i <= last; i += 8) {
        // 1. load 16 bytes (11 are used):
        //    [abcd|efgh|ijk?|????]
        const __m128i input = _mm_loadu_si128((const __m128i*)(s + i));

        // 2. make all possible 4-byte substrings
        // (the data is the first operand and the byte indices the second)
        // lo = [abcd|bcde|cdef|defg]
        const __m128i lo = _mm_shuffle_epi8(input, shuffle0);
        // hi = [efgh|fghi|ghij|hijk]
        const __m128i hi = _mm_shuffle_epi8(input, shuffle1);

        // 3. compare the 4-byte substrings with the needle
        const __m128i eq_lo = _mm_cmpeq_epi32(v_needle, lo);
        const __m128i eq_hi = _mm_cmpeq_epi32(v_needle, hi);

        // to perform single movemask in the main loop
        const __m128i t0 = _mm_or_si128(eq_lo, eq_hi);
        const int mask = _mm_movemask_ps(_mm_castsi128_ps(t0));
        if (mask != 0) {
            const int mask_lo = _mm_movemask_ps(_mm_castsi128_ps(eq_lo));

            // lowest set bit = earliest match; matches at offsets 0..3 win
            size_t position;
            if (mask_lo != 0) {
                position = i + __builtin_ctz(mask_lo);
            } else {
                // eq_lo is all zeros, so `mask` equals the mask of eq_hi
                position = i + 4 + __builtin_ctz(mask);
            }

            // a match starting beyond `last` overlaps the bytes after the haystack
            return (position <= last) ? position : std::string::npos;
        }
    }

    return std::string::npos;
}
// ------------------------------------------------------------------------
// Substring search restricted to needles of exactly four bytes (v2 kernel).
// Returns std::string::npos for any other needle length and when the
// haystack is shorter than the needle (the same n < k guard the other
// wrappers such as sse_naive_strstr apply before searching).
size_t sse2_strstr_needle4_v2(const char* s, size_t n, const char* needle, size_t k) {

    if (k != 4 || n < k) {
        return std::string::npos;
    }

    return sse2_needle4_v2(s, n, needle, k);
}
// ------------------------------------------------------------------------
size_t sse2_strstr_needle4_v2(const std::string& s, const std::string& needle) {
return sse2_strstr_needle4_v2(s.data(), s.size(), needle.data(), needle.size());
}
================================================
FILE: sse2-strstr.cpp
================================================
// implements scheme described in http://0x80.pl/articles/simd-friendly-karp-rabin.html
// Generic kernel for a needle length known only at run time.
//
// For 16 start positions at once the first and the last needle character
// are compared against the haystack; every position where both agree is
// then verified with memcmp over the k - 2 bytes in between.
//
// The vector loads read past s + n and the returned offset is not checked
// against n - k here.  NOTE(review): presumably the caller filters
// out-of-range results and keeps the trailing memory readable - confirm.
size_t FORCE_INLINE sse2_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) {

    assert(k > 0);
    assert(n > 0);

    const __m128i first = _mm_set1_epi8(needle[0]);
    const __m128i last  = _mm_set1_epi8(needle[k - 1]);

    for (size_t offset = 0; offset < n; offset += 16) {

        const char* chunk = s + offset;

        const __m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(chunk));
        const __m128i tail = _mm_loadu_si128(reinterpret_cast<const __m128i*>(chunk + k - 1));

        const __m128i both = _mm_and_si128(_mm_cmpeq_epi8(first, head),
                                           _mm_cmpeq_epi8(last, tail));

        // walk over the candidate positions, lowest bit first
        for (uint16_t mask = _mm_movemask_epi8(both); mask != 0; mask = bits::clear_leftmost_set(mask)) {

            const auto bitpos = bits::get_first_bit_set(mask);

            if (memcmp(chunk + bitpos + 1, needle + 1, k - 2) == 0) {
                return offset + bitpos;
            }
        }
    }

    return std::string::npos;
}
// ------------------------------------------------------------------------
// SSE2 substring search for a needle whose length k is a compile-time
// constant.
//
// Candidates are found exactly as in the run-time-length variant (first and
// last needle byte compared against 16 positions at once); each candidate is
// then confirmed by calling memcmp_fun(haystack position + 1, needle + 1),
// whose truthy result is treated as a match.
//
// Both 16-byte loads are issued for every i < n, so memory past s + n is
// read, and the returned index is not checked against n - k here.
//
// Returns the index of the first confirmed candidate or std::string::npos.
template <size_t k, typename MEMCMP>
size_t FORCE_INLINE sse2_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) {

    assert(k > 0);
    assert(n > 0);

    const __m128i needle_head = _mm_set1_epi8(needle[0]);
    const __m128i needle_tail = _mm_set1_epi8(needle[k - 1]);

    size_t offset = 0;
    while (offset < n) {

        const __m128i head_chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + offset));
        const __m128i tail_chunk = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + offset + k - 1));

        const __m128i both = _mm_and_si128(_mm_cmpeq_epi8(needle_head, head_chunk),
                                            _mm_cmpeq_epi8(needle_tail, tail_chunk));

        for (uint32_t candidates = _mm_movemask_epi8(both);
             candidates != 0;
             candidates = bits::clear_leftmost_set(candidates)) {

            const auto pos = bits::get_first_bit_set(candidates);

            if (memcmp_fun(s + offset + pos + 1, needle + 1)) {
                return offset + pos;
            }
        }

        offset += 16;
    }

    return std::string::npos;
}
// ------------------------------------------------------------------------
// SSE2 substring search: dispatches on the needle length k to a
// length-specialised kernel.
//
// s, n      - haystack pointer and its length in bytes
// needle, k - needle pointer and its length in bytes
//
// Returns the index of the first occurrence of needle in s, 0 for an empty
// needle, or std::string::npos when there is no occurrence (including
// n < k).
size_t sse2_strstr_v2(const char* s, size_t n, const char* needle, size_t k) {

    size_t result = std::string::npos;

    if (n < k) {
        return result;
    }

    switch (k) {
        case 0:
            return 0;

        case 1: {
            // memchr is bounded by n: the scan neither relies on a NUL
            // terminator nor can it report a position at or beyond n.
            const char* res = static_cast<const char*>(memchr(s, needle[0], n));

            return (res != nullptr) ? static_cast<size_t>(res - s) : std::string::npos;
        }

        case 2:
            result = sse2_strstr_memcmp<2>(s, n, needle, always_true);
            break;

        case 3:
            result = sse2_strstr_memcmp<3>(s, n, needle, memcmp1);
            break;

        case 4:
            result = sse2_strstr_memcmp<4>(s, n, needle, memcmp2);
            break;

        case 5:
            result = sse2_strstr_memcmp<5>(s, n, needle, memcmp4);
            break;

        case 6:
            result = sse2_strstr_memcmp<6>(s, n, needle, memcmp4);
            break;

        case 7:
            result = sse2_strstr_memcmp<7>(s, n, needle, memcmp5);
            break;

        case 8:
            result = sse2_strstr_memcmp<8>(s, n, needle, memcmp6);
            break;

        case 9:
            result = sse2_strstr_memcmp<9>(s, n, needle, memcmp8);
            break;

        case 10:
            result = sse2_strstr_memcmp<10>(s, n, needle, memcmp8);
            break;

        case 11:
            result = sse2_strstr_memcmp<11>(s, n, needle, memcmp9);
            break;

        case 12:
            result = sse2_strstr_memcmp<12>(s, n, needle, memcmp10);
            break;

        default:
            result = sse2_strstr_anysize(s, n, needle, k);
            break;
    }

    // the kernels work on whole 16-byte blocks and may report a position
    // whose match would extend past the end of the haystack
    if (result <= n - k) {
        return result;
    } else {
        return std::string::npos;
    }
}
// ------------------------------------------------------------------------
// Convenience overload: forwards the data pointers and sizes of both
// strings to the pointer/length overload.
size_t sse2_strstr_v2(const std::string& s, const std::string& needle) {

    const char* const haystack = s.data();
    const char* const pattern  = needle.data();

    return sse2_strstr_v2(haystack, s.size(), pattern, needle.size());
}
================================================
FILE: sse4-strstr-unrolled.cpp
================================================
// Note: it appears that these specialized functions do not help,
// but I decided to leave them in, just in case.
// use functions/templates dealing with certain substring length
//#define ENABLE_SSE4_LENGTH_SPECIALIZATIONS
// When defined use sse4_strstr_unrolled_memcmp template,
// otherwise use just sse4_strstr_unrolled_max20 and sse4_strstr_unrolled_max36
//#define ENABLE_SSE4_MEMCMP_TEMPLATES
size_t sse4_strstr_unrolled_anysize(const char* s, size_t n, const char* needle, size_t needle_size) {
assert(needle_size > 4);
assert(n > 0);
const __m128i prefix = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needle));
const __m128i zeros = _mm_setzero_si128();
__m128i prev = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s));
__m128i curr;
for (size_t i = 0; i < n; i += 16) {
curr = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i + 16));
const __m128i data0 = prev;
const __m128i data1 = _mm_alignr_epi8(curr, prev, 8);
const __m128i result0 = _mm_mpsadbw_epu8(data0, prefix, 0);
const __m128i result1 = _mm_mpsadbw_epu8(data1, prefix, 0);
prev = curr;
const __m128i result = _mm_packus_epi16(result0, result1);
const __m128i cmp = _mm_cmpeq_epi8(result, zeros);
unsigned mask = _mm_movemask_epi8(cmp);
while (mask != 0) {
const auto bitpos = bits::get_first_bit_set(mask);
if (memcmp(s + i + bitpos + 4, needle + 4, needle_size - 4) == 0) {
return i + bitpos;
}
mask = bits::clear_leftmost_set(mask);
}
}
return std::string::npos;
}
// ------------------------------------------------------------------------
template <size_t k, typename MEMCMP>
size_t sse4_strstr_unrolled_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) {
assert(k > 4);
assert(n > 0);
const __m128i prefix = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needle));
const __m128i zeros = _mm_setzero_si128();
__m128i prev = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s));
__m128i curr;
for (size_t i = 0; i < n; i += 16) {
curr = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i + 16));
const __m128i data0 = prev;
const __m128i data1 = _mm_alignr_epi8(curr, prev, 8);
const __m128i result0 = _mm_mpsadbw_epu8(data0, prefix, 0);
const __m128i result1 = _mm_mpsadbw_epu8(data1, prefix, 0);
prev = curr;
const __m128i result = _mm_packus_epi16(result0, result1);
const __m128i cmp = _mm_cmpeq_epi8(result, zeros);
unsigned mask = _mm_movemask_epi8(cmp);
while (mask != 0) {
const auto bitpos = bits::get_first_bit_set(mask);
if (memcmp_fun(s + i + bitpos + 4, needle + 4)) {
return i + bitpos;
}
mask = bits::clear_leftmost_set(mask);
}
}
return std::string::npos;
}
// ------------------------------------------------------------------------
size_t sse4_strstr_unrolled_max20(const char* s, size_t n, const char* needle, size_t needle_size) {
const __m128i zeros = _mm_setzero_si128();
const __m128i prefix = sse::load(needle);
const __m128i suffix = sse::load(needle + 4);
const __m128i suff_mask = sse::mask_lower_bytes(needle_size - 4);
for (size_t i = 0; i < n; i += 8) {
const __m128i data = sse::load(s + i);
const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);
const __m128i cmp = _mm_cmpeq_epi16(result, zeros);
unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;
while (mask != 0) {
const auto bitpos = bits::get_first_bit_set(mask)/2;
const __m128i str = sse::load(s + i + bitpos + 4);
const __m128i cmp = _mm_cmpeq_epi8(str, suffix);
if (_mm_testc_si128(cmp, suff_mask)) {
return i + bitpos;
}
mask = bits::clear_leftmost_set(mask);
}
}
return std::string::npos;
}
// ------------------------------------------------------------------------
size_t sse4_strstr_unrolled_max36(const char* s, size_t n, const char* needle, size_t needle_size) {
const __m128i zeros = _mm_setzero_si128();
const __m128i prefix = sse::load(needle);
const __m128i suffix1 = sse::load(needle + 4);
const __m128i suffix2 = sse::load(needle + 16 + 4);
const __m128i suff_mask = sse::mask_higher_bytes(needle_size - (16 + 4));
for (size_t i = 0; i < n; i += 8) {
const __m128i data = sse::load(s + i);
const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);
const __m128i cmp = _mm_cmpeq_epi16(result, zeros);
unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;
while (mask != 0) {
const auto bitpos = bits::get_first_bit_set(mask)/2;
const __m128i c1 = _mm_cmpeq_epi8(sse::load(s + i + bitpos + 4), suffix1);
const __m128i c2 = _mm_cmpeq_epi8(sse::load(s + i + bitpos + 16 + 4), suffix2);
const __m128i c3 = _mm_or_si128(c2, suff_mask);
const __m128i tmp = _mm_and_si128(c1, c3);
if (_mm_movemask_epi8(tmp) == 0xffff) {
return i + bitpos;
}
mask = bits::clear_leftmost_set(mask);
}
}
return std::string::npos;
}
// ------------------------------------------------------------------------
// SSE4.1 (MPSADBW) search for a needle of exactly 3 bytes.
//
// MPSADBW always sums absolute differences over 4 bytes. The 4th pattern
// byte is therefore set to zero, which makes the 4th term equal to the
// haystack byte at position + 3; subtracting that byte leaves the sum over
// the 3 real needle bytes, which is zero exactly when they match.
//
// The pattern is assembled from needle[0..2] only, so the result does not
// depend on what follows the needle in memory and nothing past the needle
// is read.
//
// Every iteration loads 16 haystack bytes at s + i, so memory past s + n is
// read, and the returned index is not checked against n - 3 here.
//
// Returns the index of the first match or std::string::npos.
size_t sse4_strstr_unrolled_len3(const char* s, size_t n, const char* needle) {

    // little-endian layout: byte 0 = needle[0], byte 1 = needle[1],
    // byte 2 = needle[2], byte 3 = 0
    const uint32_t needle3 =  static_cast<uint32_t>(static_cast<uint8_t>(needle[0]))
                           | (static_cast<uint32_t>(static_cast<uint8_t>(needle[1])) << 8)
                           | (static_cast<uint32_t>(static_cast<uint8_t>(needle[2])) << 16);

    const __m128i prefix = _mm_cvtsi32_si128(static_cast<int>(needle3));
    const __m128i zeros  = _mm_setzero_si128();

    for (size_t i = 0; i < n; i += 8) {

        const __m128i data     = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));
        // bytes i+3 .. i+10 widened to 16 bits: the "4th byte" of each position
        const __m128i lastbyte = _mm_cvtepu8_epi16(_mm_srli_si128(data, 3));
        const __m128i result   = _mm_mpsadbw_epu8(data, prefix, 0);

        const __m128i cmp = _mm_cmpeq_epi16(_mm_sub_epi16(result, lastbyte), zeros);

        // each 16-bit lane yields two identical mask bits; keep the even one
        unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;

        if (mask != 0) {
            return i + bits::get_first_bit_set(mask)/2;
        }
    }

    return std::string::npos;
}
// ------------------------------------------------------------------------
size_t sse4_strstr_unrolled_len4(const char* s, size_t n, const char* needle) {
const __m128i prefix = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needle));
const __m128i zeros = _mm_setzero_si128();
for (size_t i = 0; i < n; i += 8) {
const __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));
const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);
const __m128i cmp = _mm_cmpeq_epi16(result, zeros);
unsigned mask = _mm_movemask_epi8(cmp);
if (mask != 0) {
return i + bits::get_first_bit_set(mask)/2;
}
}
return std::string::npos;
}
// ------------------------------------------------------------------------
// Unrolled SSE4.1 substring search: dispatches on the needle length to a
// length-specialised kernel.
//
// s, n                - haystack pointer and its length in bytes
// needle, needle_size - needle pointer and its length in bytes
//
// Needles of 1 and 2 bytes are handled with bounded memchr scans; the
// optional specialisations for 5..36 bytes are compiled in only when
// ENABLE_SSE4_LENGTH_SPECIALIZATIONS (and ENABLE_SSE4_MEMCMP_TEMPLATES) are
// defined.
//
// Returns the index of the first occurrence, 0 for an empty needle, or
// std::string::npos when there is none (including n < needle_size).
size_t sse4_strstr_unrolled(const char* s, size_t n, const char* needle, size_t needle_size) {

    size_t result = std::string::npos;

    if (n < needle_size) {
        return result;
    }

    switch (needle_size) {
        case 0:
            return 0;

        case 1: {
            // bounded by n: no reliance on a NUL terminator
            const char* res = static_cast<const char*>(memchr(s, needle[0], n));

            return (res != nullptr) ? static_cast<size_t>(res - s) : std::string::npos;
        }

        case 2: {
            // n >= 2 here; `last` is one past the final valid start position
            const char* const last = s + n - 1;
            const char* p = s;

            while (p < last) {
                p = static_cast<const char*>(memchr(p, needle[0], last - p));
                if (p == nullptr) {
                    break;
                }

                if (p[1] == needle[1]) {
                    return static_cast<size_t>(p - s);
                }

                ++p;
            }

            return std::string::npos;
        }

        case 3:
            result = sse4_strstr_unrolled_len3(s, n, needle);
            break;

        case 4:
            result = sse4_strstr_unrolled_len4(s, n, needle);
            break;

#ifdef ENABLE_SSE4_LENGTH_SPECIALIZATIONS
#ifdef ENABLE_SSE4_MEMCMP_TEMPLATES
        case 5:
            result = sse4_strstr_unrolled_memcmp<5>(s, n, needle, memcmp1);
            break;

        case 6:
            result = sse4_strstr_unrolled_memcmp<6>(s, n, needle, memcmp2);
            break;

        case 7:
            result = sse4_strstr_unrolled_memcmp<7>(s, n, needle, memcmp3);
            break;

        case 8:
            result = sse4_strstr_unrolled_memcmp<8>(s, n, needle, memcmp4);
            break;

        case 9:
            result = sse4_strstr_unrolled_memcmp<9>(s, n, needle, memcmp5);
            break;

        case 10:
            result = sse4_strstr_unrolled_memcmp<10>(s, n, needle, memcmp6);
            break;

        case 11:
            result = sse4_strstr_unrolled_memcmp<11>(s, n, needle, memcmp7);
            break;

        case 12:
            result = sse4_strstr_unrolled_memcmp<12>(s, n, needle, memcmp8);
            break;

        case 13:
            result = sse4_strstr_unrolled_memcmp<13>(s, n, needle, memcmp9);
            break;

        case 14:
            result = sse4_strstr_unrolled_memcmp<14>(s, n, needle, memcmp10);
            break;
#else
        case 5: case 6: case 7: case 8:
        case 9: case 10: case 11: case 12:
        case 13: case 14: /* 5 .. 14 */
#endif // ENABLE_SSE4_MEMCMP_TEMPLATES
        case 15: case 16: case 17: case 18: case 19:
        case 20: /* 15..20 */
            result = sse4_strstr_unrolled_max20(s, n, needle, needle_size);
            break;

        case 21: case 22: case 23: case 24: case 25:
        case 26: case 27: case 28: case 29: case 30:
        case 31: case 32: case 33: case 34: case 35:
        case 36: /* 21..36 */
            result = sse4_strstr_unrolled_max36(s, n, needle, needle_size);
            break;
#endif // ENABLE_SSE4_LENGTH_SPECIALIZATIONS
        default:
            result = sse4_strstr_unrolled_anysize(s, n, needle, needle_size);
            break;
    }

    // the kernels work on whole blocks and may report a position whose match
    // would extend past the end of the haystack
    if (result <= n - needle_size) {
        return result;
    } else {
        return std::string::npos;
    }
}
// --------------------------------------------------
// Convenience overload: forwards the data pointers and sizes of both
// strings to the pointer/length overload.
size_t sse4_strstr_unrolled(const std::string& s, const std::string& needle) {

    const char* const haystack = s.data();
    const char* const pattern  = needle.data();

    return sse4_strstr_unrolled(haystack, s.size(), pattern, needle.size());
}
================================================
FILE: sse4-strstr.cpp
================================================
size_t sse4_strstr_anysize(const char* s, size_t n, const char* needle, size_t needle_size) {
assert(needle_size > 4);
assert(n > 0);
const __m128i prefix = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needle));
const __m128i zeros = _mm_setzero_si128();
for (size_t i = 0; i < n; i += 8) {
const __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));
const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);
const __m128i cmp = _mm_cmpeq_epi16(result, zeros);
unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;
while (mask != 0) {
const auto bitpos = bits::get_first_bit_set(mask)/2;
if (memcmp(s + i + bitpos + 4, needle + 4, needle_size - 4) == 0) {
return i + bitpos;
}
mask = bits::clear_leftmost_set(mask);
}
}
return std::string::npos;
}
// ------------------------------------------------------------------------
template <size_t k, typename MEMCMP>
size_t sse4_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) {
assert(k > 4);
assert(n > 0);
const __m128i prefix = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needle));
const __m128i zeros = _mm_setzero_si128();
for (size_t i = 0; i < n; i += 8) {
const __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));
const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);
const __m128i cmp = _mm_cmpeq_epi16(result, zeros);
unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;
while (mask != 0) {
const auto bitpos = bits::get_first_bit_set(mask)/2;
if (memcmp_fun(s + i + bitpos + 4, needle + 4)) {
return i + bitpos;
}
mask = bits::clear_leftmost_set(mask);
}
}
return std::string::npos;
}
// ------------------------------------------------------------------------
size_t sse4_strstr_max20(const char* s, size_t n, const char* needle, size_t needle_size) {
const __m128i zeros = _mm_setzero_si128();
const __m128i prefix = sse::load(needle);
const __m128i suffix = sse::load(needle + 4);
const __m128i suff_mask = sse::mask_lower_bytes(needle_size - 4);
for (size_t i = 0; i < n; i += 8) {
const __m128i data = sse::load(s + i);
const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);
const __m128i cmp = _mm_cmpeq_epi16(result, zeros);
unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;
while (mask != 0) {
const auto bitpos = bits::get_first_bit_set(mask)/2;
const __m128i str = sse::load(s + i + bitpos + 4);
const __m128i cmp = _mm_cmpeq_epi8(str, suffix);
if (_mm_testc_si128(cmp, suff_mask)) {
return i + bitpos;
}
mask = bits::clear_leftmost_set(mask);
}
}
return std::string::npos;
}
// ------------------------------------------------------------------------
size_t sse4_strstr_max36(const char* s, size_t n, const char* needle, size_t needle_size) {
const __m128i zeros = _mm_setzero_si128();
const __m128i prefix = sse::load(needle);
const __m128i suffix1 = sse::load(needle + 4);
const __m128i suffix2 = sse::load(needle + 16 + 4);
const __m128i suff_mask = sse::mask_higher_bytes(needle_size - (16 + 4));
for (size_t i = 0; i < n; i += 8) {
const __m128i data = sse::load(s + i);
const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);
const __m128i cmp = _mm_cmpeq_epi16(result, zeros);
unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;
while (mask != 0) {
const auto bitpos = bits::get_first_bit_set(mask)/2;
const __m128i c1 = _mm_cmpeq_epi8(sse::load(s + i + bitpos + 4), suffix1);
const __m128i c2 = _mm_cmpeq_epi8(sse::load(s + i + bitpos + 16 + 4), suffix2);
const __m128i c3 = _mm_or_si128(c2, suff_mask);
const __m128i tmp = _mm_and_si128(c1, c3);
if (_mm_movemask_epi8(tmp) == 0xffff) {
return i + bitpos;
}
mask = bits::clear_leftmost_set(mask);
}
}
return std::string::npos;
}
// ------------------------------------------------------------------------
// SSE4.1 (MPSADBW) search for a needle of exactly 3 bytes.
//
// MPSADBW always sums absolute differences over 4 bytes. The 4th pattern
// byte is therefore set to zero, which makes the 4th term equal to the
// haystack byte at position + 3; subtracting that byte leaves the sum over
// the 3 real needle bytes, which is zero exactly when they match.
//
// The pattern is assembled from needle[0..2] only, so the result does not
// depend on what follows the needle in memory and nothing past the needle
// is read.
//
// Every iteration loads 16 haystack bytes at s + i, so memory past s + n is
// read, and the returned index is not checked against n - 3 here.
//
// Returns the index of the first match or std::string::npos.
size_t sse4_strstr_len3(const char* s, size_t n, const char* needle) {

    // little-endian layout: byte 0 = needle[0], byte 1 = needle[1],
    // byte 2 = needle[2], byte 3 = 0
    const uint32_t needle3 =  static_cast<uint32_t>(static_cast<uint8_t>(needle[0]))
                           | (static_cast<uint32_t>(static_cast<uint8_t>(needle[1])) << 8)
                           | (static_cast<uint32_t>(static_cast<uint8_t>(needle[2])) << 16);

    const __m128i prefix = _mm_cvtsi32_si128(static_cast<int>(needle3));
    const __m128i zeros  = _mm_setzero_si128();

    for (size_t i = 0; i < n; i += 8) {

        const __m128i data     = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));
        // bytes i+3 .. i+10 widened to 16 bits: the "4th byte" of each position
        const __m128i lastbyte = _mm_cvtepu8_epi16(_mm_srli_si128(data, 3));
        const __m128i result   = _mm_mpsadbw_epu8(data, prefix, 0);

        const __m128i cmp = _mm_cmpeq_epi16(_mm_sub_epi16(result, lastbyte), zeros);

        // each 16-bit lane yields two identical mask bits; keep the even one
        unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;

        if (mask != 0) {
            return i + bits::get_first_bit_set(mask)/2;
        }
    }

    return std::string::npos;
}
// ------------------------------------------------------------------------
size_t sse4_strstr_len4(const char* s, size_t n, const char* needle) {
const __m128i prefix = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needle));
const __m128i zeros = _mm_setzero_si128();
for (size_t i = 0; i < n; i += 8) {
const __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));
const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);
const __m128i cmp = _mm_cmpeq_epi16(result, zeros);
unsigned mask = _mm_movemask_epi8(cmp);
if (mask != 0) {
return i + bits::get_first_bit_set(mask)/2;
}
}
return std::string::npos;
}
// ------------------------------------------------------------------------
// SSE4.1 substring search: dispatches on the needle length to a
// length-specialised kernel.
//
// s, n                - haystack pointer and its length in bytes
// needle, needle_size - needle pointer and its length in bytes
//
// Needles of 1 and 2 bytes are handled with bounded memchr scans, 3 and 4
// bytes with dedicated MPSADBW kernels, 5..14 with the memcmp template
// (selected by the `#if 1` below), 15..20 and 21..36 with the masked
// vector comparisons, everything longer with the generic kernel.
//
// Returns the index of the first occurrence, 0 for an empty needle, or
// std::string::npos when there is none (including n < needle_size).
size_t sse4_strstr(const char* s, size_t n, const char* needle, size_t needle_size) {

    size_t result = std::string::npos;

    if (n < needle_size) {
        return result;
    }

    switch (needle_size) {
        case 0:
            return 0;

        case 1: {
            // bounded by n: no reliance on a NUL terminator
            const char* res = static_cast<const char*>(memchr(s, needle[0], n));

            return (res != nullptr) ? static_cast<size_t>(res - s) : std::string::npos;
        }

        case 2: {
            // n >= 2 here; `last` is one past the final valid start position
            const char* const last = s + n - 1;
            const char* p = s;

            while (p < last) {
                p = static_cast<const char*>(memchr(p, needle[0], last - p));
                if (p == nullptr) {
                    break;
                }

                if (p[1] == needle[1]) {
                    return static_cast<size_t>(p - s);
                }

                ++p;
            }

            return std::string::npos;
        }

        case 3:
            result = sse4_strstr_len3(s, n, needle);
            break;

        case 4:
            result = sse4_strstr_len4(s, n, needle);
            break;

#if 1
        case 5:
            result = sse4_strstr_memcmp<5>(s, n, needle, memcmp1);
            break;

        case 6:
            result = sse4_strstr_memcmp<6>(s, n, needle, memcmp2);
            break;

        case 7:
            result = sse4_strstr_memcmp<7>(s, n, needle, memcmp3);
            break;

        case 8:
            result = sse4_strstr_memcmp<8>(s, n, needle, memcmp4);
            break;

        case 9:
            result = sse4_strstr_memcmp<9>(s, n, needle, memcmp5);
            break;

        case 10:
            result = sse4_strstr_memcmp<10>(s, n, needle, memcmp6);
            break;

        case 11:
            result = sse4_strstr_memcmp<11>(s, n, needle, memcmp7);
            break;

        case 12:
            result = sse4_strstr_memcmp<12>(s, n, needle, memcmp8);
            break;

        case 13:
            result = sse4_strstr_memcmp<13>(s, n, needle, memcmp9);
            break;

        case 14:
            result = sse4_strstr_memcmp<14>(s, n, needle, memcmp10);
            break;
#else
        case 5: case 6: case 7: case 8:
        case 9: case 10: case 11: case 12:
        case 13: case 14: /* 5 .. 14 */
#endif
        case 15: case 16: case 17: case 18: case 19:
        case 20: /* 15..20 */
            result = sse4_strstr_max20(s, n, needle, needle_size);
            break;

        case 21: case 22: case 23: case 24: case 25:
        case 26: case 27: case 28: case 29: case 30:
        case 31: case 32: case 33: case 34: case 35:
        case 36: /* 21..36 */
            result = sse4_strstr_max36(s, n, needle, needle_size);
            break;

        default:
            result = sse4_strstr_anysize(s, n, needle, needle_size);
            break;
    }

    // the kernels work on whole blocks and may report a position whose match
    // would extend past the end of the haystack
    if (result <= n - needle_size) {
        return result;
    } else {
        return std::string::npos;
    }
}
// --------------------------------------------------
// Convenience overload: forwards the data pointers and sizes of both
// strings to the pointer/length overload.
size_t sse4_strstr(const std::string& s, const std::string& needle) {

    const char* const haystack = s.data();
    const char* const pattern  = needle.data();

    return sse4_strstr(haystack, s.size(), pattern, needle.size());
}
================================================
FILE: sse4.2-strstr.cpp
================================================
/* Usage of PCMPESTRM instruction from SSE 4.2 */
// SSE4.2 (PCMPESTRM, "equal ordered" mode) substring search for a needle of
// run-time length k.
//
// For each 16-byte haystack block the instruction returns a bit mask of the
// positions where the needle (or, near the end of the block, its leading
// part) matches; every flagged position is then verified with memcmp over
// the k - 1 bytes following the first one.
//
// The explicit lengths handed to PCMPESTRM are clamped to 16 before being
// narrowed to int. The instruction saturates lengths at 16 anyway, so this
// is equivalent for ordinary inputs, but it avoids feeding it a truncated
// (possibly tiny or negative) value when n - i or k exceeds INT_MAX.
//
// 16 bytes are loaded from needle and from every s + i, so memory past the
// end of both buffers may be read; the returned index is not checked
// against n - k here.
//
// Returns the index of the first verified position or std::string::npos.
size_t FORCE_INLINE sse42_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) {

    assert(k > 0);
    assert(n > 0);

    const __m128i N = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needle));
    const int needle_len = (k < 16) ? static_cast<int>(k) : 16;

    for (size_t i = 0; i < n; i += 16) {

        const int mode = _SIDD_UBYTE_OPS
                       | _SIDD_CMP_EQUAL_ORDERED
                       | _SIDD_BIT_MASK;

        const size_t remaining = n - i;
        const int data_len = (remaining < 16) ? static_cast<int>(remaining) : 16;

        const __m128i D   = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));
        const __m128i res = _mm_cmpestrm(N, needle_len, D, data_len, mode);
        uint64_t mask = _mm_cvtsi128_si64(res);

        while (mask != 0) {

            const auto bitpos = bits::get_first_bit_set(mask);

            // we know that at least the first character of needle matches
            if (memcmp(s + i + bitpos + 1, needle + 1, k - 1) == 0) {
                return i + bitpos;
            }

            mask = bits::clear_leftmost_set(mask);
        }
    }

    return std::string::npos;
}
// SSE4.2 (PCMPESTRM, "equal ordered" mode) substring search for a needle
// whose length k is a compile-time constant.
//
// Every position flagged by the instruction is confirmed by calling
// memcmp_fun(haystack position + 1, needle + 1), whose truthy result is
// treated as a match.
//
// The explicit lengths handed to PCMPESTRM are clamped to 16 before being
// narrowed to int; the instruction saturates lengths at 16 anyway, and the
// clamp avoids passing a truncated value when n - i exceeds INT_MAX.
//
// 16 bytes are loaded from needle and from every s + i, so memory past the
// end of both buffers may be read; the returned index is not checked
// against n - k here.
//
// Returns the index of the first confirmed position or std::string::npos.
template <size_t k, typename MEMCMP>
size_t FORCE_INLINE sse42_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) {

    assert(k > 0);
    assert(n > 0);

    const __m128i N = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needle));
    const int needle_len = (k < 16) ? static_cast<int>(k) : 16;

    for (size_t i = 0; i < n; i += 16) {

        const int mode = _SIDD_UBYTE_OPS
                       | _SIDD_CMP_EQUAL_ORDERED
                       | _SIDD_BIT_MASK;

        const size_t remaining = n - i;
        const int data_len = (remaining < 16) ? static_cast<int>(remaining) : 16;

        const __m128i D   = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));
        const __m128i res = _mm_cmpestrm(N, needle_len, D, data_len, mode);
        uint64_t mask = _mm_cvtsi128_si64(res);

        while (mask != 0) {

            const auto bitpos = bits::get_first_bit_set(mask);

            if (memcmp_fun(s + i + bitpos + 1, needle + 1)) {
                return i + bitpos;
            }

            mask = bits::clear_leftmost_set(mask);
        }
    }

    return std::string::npos;
}
// ------------------------------------------------------------------------
// SSE4.2 substring search: dispatches on the needle length k to a
// length-specialised kernel.
//
// s, n      - haystack pointer and its length in bytes
// needle, k - needle pointer and its length in bytes
//
// Returns the index of the first occurrence of needle in s, 0 for an empty
// needle, or std::string::npos when there is no occurrence (including
// n < k).
size_t sse42_strstr(const char* s, size_t n, const char* needle, size_t k) {

    size_t result = std::string::npos;

    if (n < k) {
        return result;
    }

    switch (k) {
        case 0:
            return 0;

        case 1: {
            // memchr is bounded by n: the scan neither relies on a NUL
            // terminator nor can it report a position at or beyond n.
            const char* res = static_cast<const char*>(memchr(s, needle[0], n));

            return (res != nullptr) ? static_cast<size_t>(res - s) : std::string::npos;
        }

        case 2:
            result = sse42_strstr_memcmp<2>(s, n, needle, memcmp1);
            break;

        case 3:
            result = sse42_strstr_memcmp<3>(s, n, needle, memcmp2);
            break;

        case 4:
            result = sse42_strstr_memcmp<4>(s, n, needle, memcmp3);
            break;

        case 5:
            result = sse42_strstr_memcmp<5>(s, n, needle, memcmp4);
            break;

        case 6:
            result = sse42_strstr_memcmp<6>(s, n, needle, memcmp5);
            break;

        case 7:
            result = sse42_strstr_memcmp<7>(s, n, needle, memcmp6);
            break;

        case 8:
            result = sse42_strstr_memcmp<8>(s, n, needle, memcmp7);
            break;

        case 9:
            result = sse42_strstr_memcmp<9>(s, n, needle, memcmp8);
            break;

        case 10:
            result = sse42_strstr_memcmp<10>(s, n, needle, memcmp9);
            break;

        case 11:
            result = sse42_strstr_memcmp<11>(s, n, needle, memcmp10);
            break;

        case 12:
            result = sse42_strstr_memcmp<12>(s, n, needle, memcmp11);
            break;

        default:
            result = sse42_strstr_anysize(s, n, needle, k);
            break;
    }

    // the kernels may report a position whose match would extend past the
    // end of the haystack
    if (result <= n - k) {
        return result;
    } else {
        return std::string::npos;
    }
}
// ------------------------------------------------------------------------
// Convenience overload: forwards the data pointers and sizes of both
// strings to the pointer/length overload.
size_t sse42_strstr(const std::string& s, const std::string& needle) {

    const char* const haystack = s.data();
    const char* const pattern  = needle.data();

    return sse42_strstr(haystack, s.size(), pattern, needle.size());
}
================================================
FILE: swar32-strstr-v2.cpp
================================================
// SWAR (32-bit word) substring search for a needle of run-time length k.
//
// Four haystack positions are examined per iteration: the word at s + i is
// compared with the first needle byte and the word at s + i + k - 1 with
// the last one; positions where both bytes match are verified by comparing
// the k - 2 bytes after the first one with memcmp.
//
// Words are fetched with memcpy, so no alignment of s is required and no
// char data is accessed through a uint32_t pointer. The loop index is a
// size_t, matching the type of n.
//
// NOTE(review): mapping byte j of a word to s[i + j] assumes a
// little-endian target - confirm for other byte orders.
//
// Both word loads are issued for every i < n, so memory past s + n is read,
// and the returned index is not checked against n - k here.
//
// Returns the index of the first verified position or std::string::npos.
size_t FORCE_INLINE swar32_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) {

    assert(k > 0);
    assert(n > 0);

    const uint32_t first = 0x01010101u * static_cast<uint8_t>(needle[0]);
    const uint32_t last  = 0x01010101u * static_cast<uint8_t>(needle[k - 1]);

    // 2. sequence scan
    for (size_t i = 0; i < n; i += 4) {

        uint32_t block_first;
        uint32_t block_last;
        memcpy(&block_first, s + i, sizeof(block_first));
        memcpy(&block_last, s + i + k - 1, sizeof(block_last));

        // 0 bytes in eq indicate matching chars
        const uint32_t eq = (block_first ^ first) | (block_last ^ last);

        // bit 7 of a byte is set if the lower 7 bits of that eq byte are zero
        const uint32_t t0 = (~eq & 0x7f7f7f7fu) + 0x01010101u;
        // bit 7 of a byte is set if bit 7 of that eq byte is zero
        const uint32_t t1 = (~eq & 0x80808080u);
        // bit 7 set <=> the whole eq byte is zero
        uint32_t zeros = t0 & t1;
        size_t j = 0;

        while (zeros) {
            if (zeros & 0x80) {
                const char* substr = s + i + j + 1;
                if (memcmp(substr, needle + 1, k - 2) == 0) {
                    return i + j;
                }
            }

            zeros >>= 8;
            j += 1;
        }
    }

    return std::string::npos;
}
// SWAR (32-bit word) substring search for a needle whose length k is a
// compile-time constant.
//
// Four haystack positions are examined per iteration: positions where both
// the first and the last needle byte match are confirmed by calling
// memcmp_fun(haystack position + 1, needle + 1), whose truthy result is
// treated as a match.
//
// Words are fetched with memcpy, so no alignment of s is required and no
// char data is accessed through a uint32_t pointer. The loop index is a
// size_t, matching the type of n.
//
// NOTE(review): mapping byte j of a word to s[i + j] assumes a
// little-endian target - confirm for other byte orders.
//
// Both word loads are issued for every i < n, so memory past s + n is read,
// and the returned index is not checked against n - k here.
//
// Returns the index of the first confirmed position or std::string::npos.
template <size_t k, typename MEMCMP>
size_t FORCE_INLINE swar32_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) {

    assert(n > 0);

    const uint32_t first = 0x01010101u * static_cast<uint8_t>(needle[0]);
    const uint32_t last  = 0x01010101u * static_cast<uint8_t>(needle[k - 1]);

    // 2. sequence scan
    for (size_t i = 0; i < n; i += 4) {

        uint32_t block_first;
        uint32_t block_last;
        memcpy(&block_first, s + i, sizeof(block_first));
        memcpy(&block_last, s + i + k - 1, sizeof(block_last));

        // 0 bytes in eq indicate matching chars
        const uint32_t eq = (block_first ^ first) | (block_last ^ last);

        // bit 7 set <=> the whole eq byte is zero
        const uint32_t t0 = (~eq & 0x7f7f7f7fu) + 0x01010101u;
        const uint32_t t1 = (~eq & 0x80808080u);
        uint32_t zeros = t0 & t1;
        size_t j = 0;

        while (zeros) {
            if (zeros & 0x80) {
                const char* substr = s + i + j + 1;
                if (memcmp_fun(substr, needle + 1)) {
                    return i + j;
                }
            }

            zeros >>= 8;
            j += 1;
        }
    }

    return std::string::npos;
}
// ------------------------------------------------------------------------
// 32-bit SWAR substring search: dispatches on the needle length k to a
// length-specialised kernel.
//
// s, n      - haystack pointer and its length in bytes
// needle, k - needle pointer and its length in bytes
//
// Returns the index of the first occurrence of needle in s, 0 for an empty
// needle, or std::string::npos when there is no occurrence (including
// n < k).
size_t swar32_strstr_v2(const char* s, size_t n, const char* needle, size_t k) {

    size_t result = std::string::npos;

    if (n < k) {
        return result;
    }

    switch (k) {
        case 0:
            return 0;

        case 1: {
            // memchr is bounded by n: the scan neither relies on a NUL
            // terminator nor can it report a position at or beyond n.
            const char* res = static_cast<const char*>(memchr(s, needle[0], n));

            return (res != nullptr) ? static_cast<size_t>(res - s) : std::string::npos;
        }

        case 2:
            result = swar32_strstr_memcmp<2>(s, n, needle, always_true);
            break;

        case 3:
            result = swar32_strstr_memcmp<3>(s, n, needle, memcmp1);
            break;

        case 4:
            result = swar32_strstr_memcmp<4>(s, n, needle, memcmp2);
            break;

        case 5:
            // Note: use memcmp4 rather than memcmp3, as the last character
            //       of needle is already proven to be equal
            result = swar32_strstr_memcmp<5>(s, n, needle, memcmp4);
            break;

        case 6:
            result = swar32_strstr_memcmp<6>(s, n, needle, memcmp4);
            break;

        case 7:
            result = swar32_strstr_memcmp<7>(s, n, needle, memcmp5);
            break;

        case 8:
            result = swar32_strstr_memcmp<8>(s, n, needle, memcmp6);
            break;

        case 9:
            // Note: use memcmp8 rather than memcmp7 for the same reason as above.
            result = swar32_strstr_memcmp<9>(s, n, needle, memcmp8);
            break;

        case 10:
            result = swar32_strstr_memcmp<10>(s, n, needle, memcmp8);
            break;

        case 11:
            result = swar32_strstr_memcmp<11>(s, n, needle, memcmp9);
            break;

        case 12:
            result = swar32_strstr_memcmp<12>(s, n, needle, memcmp10);
            break;

        default:
            result = swar32_strstr_anysize(s, n, needle, k);
            break;
    }

    // the kernels work on whole words and may report a position whose match
    // would extend past the end of the haystack
    if (result <= n - k) {
        return result;
    } else {
        return std::string::npos;
    }
}
// Convenience overload: forwards the data pointers and sizes of both
// strings to the pointer/length overload.
size_t swar32_strstr_v2(const std::string& s, const std::string& needle) {

    const char* const haystack = s.data();
    const char* const pattern  = needle.data();

    return swar32_strstr_v2(haystack, s.size(), pattern, needle.size());
}
================================================
FILE: swar64-strstr-v2.cpp
================================================
// SWAR (64-bit word) substring search for a needle of run-time length k.
//
// Eight haystack positions are examined per iteration: the word at s + i is
// compared with the first needle byte and the word at s + i + k - 1 with
// the last one; positions where both bytes match are verified by comparing
// the k - 2 bytes after the first one with memcmp.
//
// Words are fetched with memcpy, so no alignment of s is required and no
// char data is accessed through a uint64_t pointer. The loop index is a
// size_t, matching the type of n.
//
// NOTE(review): mapping byte j of a word to s[i + j] assumes a
// little-endian target - confirm for other byte orders.
//
// Both word loads are issued for every i < n, so memory past s + n is read,
// and the returned index is not checked against n - k here.
//
// Returns the index of the first verified position or std::string::npos.
size_t FORCE_INLINE swar64_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) {

    assert(k > 0);
    assert(n > 0);

    const uint64_t first = 0x0101010101010101llu * static_cast<uint8_t>(needle[0]);
    const uint64_t last  = 0x0101010101010101llu * static_cast<uint8_t>(needle[k - 1]);

    // 2. sequence scan
    for (size_t i = 0; i < n; i += 8) {

        uint64_t block_first;
        uint64_t block_last;
        memcpy(&block_first, s + i, sizeof(block_first));
        memcpy(&block_last, s + i + k - 1, sizeof(block_last));

        // 0 bytes in eq indicate matching chars
        const uint64_t eq = (block_first ^ first) | (block_last ^ last);

        // bit 7 of a byte is set if the lower 7 bits of that eq byte are zero
        const uint64_t t0 = (~eq & 0x7f7f7f7f7f7f7f7fllu) + 0x0101010101010101llu;
        // bit 7 of a byte is set if bit 7 of that eq byte is zero
        const uint64_t t1 = (~eq & 0x8080808080808080llu);
        // bit 7 set <=> the whole eq byte is zero
        uint64_t zeros = t0 & t1;
        size_t j = 0;

        while (zeros) {
            if (zeros & 0x80) {
                const char* substr = s + i + j + 1;
                if (memcmp(substr, needle + 1, k - 2) == 0) {
                    return i + j;
                }
            }

            zeros >>= 8;
            j += 1;
        }
    }

    return std::string::npos;
}
// SWAR (64-bit word) substring search for a needle whose length k is a
// compile-time constant.
//
// Eight haystack positions are examined per iteration: positions where both
// the first and the last needle byte match are confirmed by calling
// memcmp_fun(haystack position + 1, needle + 1), whose truthy result is
// treated as a match.
//
// Words are fetched with memcpy, so no alignment of s is required and no
// char data is accessed through a uint64_t pointer. The loop index is a
// size_t, matching the type of n.
//
// NOTE(review): mapping byte j of a word to s[i + j] assumes a
// little-endian target - confirm for other byte orders.
//
// Both word loads are issued for every i < n, so memory past s + n is read,
// and the returned index is not checked against n - k here.
//
// Returns the index of the first confirmed position or std::string::npos.
template <size_t k, typename MEMCMP>
size_t FORCE_INLINE swar64_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) {

    assert(n > 0);

    const uint64_t first = 0x0101010101010101llu * static_cast<uint8_t>(needle[0]);
    const uint64_t last  = 0x0101010101010101llu * static_cast<uint8_t>(needle[k - 1]);

    // 2. sequence scan
    for (size_t i = 0; i < n; i += 8) {

        uint64_t block_first;
        uint64_t block_last;
        memcpy(&block_first, s + i, sizeof(block_first));
        memcpy(&block_last, s + i + k - 1, sizeof(block_last));

        // 0 bytes in eq indicate matching chars
        const uint64_t eq = (block_first ^ first) | (block_last ^ last);

        // bit 7 set <=> the whole eq byte is zero
        const uint64_t t0 = (~eq & 0x7f7f7f7f7f7f7f7fllu) + 0x0101010101010101llu;
        const uint64_t t1 = (~eq & 0x8080808080808080llu);
        uint64_t zeros = t0 & t1;
        size_t j = 0;

        while (zeros) {
            if (zeros & 0x80) {
                const char* substr = s + i + j + 1;
                if (memcmp_fun(substr, needle + 1)) {
                    return i + j;
                }
            }

            zeros >>= 8;
            j += 1;
        }
    }

    return std::string::npos;
}
// ------------------------------------------------------------------------
// 64-bit SWAR substring search: dispatches on the needle length k to a
// length-specialised kernel.
//
// s, n      - haystack pointer and its length in bytes
// needle, k - needle pointer and its length in bytes
//
// Returns the index of the first occurrence of needle in s, 0 for an empty
// needle, or std::string::npos when there is no occurrence (including
// n < k).
size_t swar64_strstr_v2(const char* s, size_t n, const char* needle, size_t k) {

    size_t result = std::string::npos;

    if (n < k) {
        return result;
    }

    switch (k) {
        case 0:
            return 0;

        case 1: {
            // memchr is bounded by n: the scan neither relies on a NUL
            // terminator nor can it report a position at or beyond n.
            const char* res = static_cast<const char*>(memchr(s, needle[0], n));

            return (res != nullptr) ? static_cast<size_t>(res - s) : std::string::npos;
        }

        case 2:
            result = swar64_strstr_memcmp<2>(s, n, needle, always_true);
            break;

        case 3:
            result = swar64_strstr_memcmp<3>(s, n, needle, memcmp1);
            break;

        case 4:
            result = swar64_strstr_memcmp<4>(s, n, needle, memcmp2);
            break;

        case 5:
            // Note: use memcmp4 rather than memcmp3, as the last character
            //       of needle is already proven to be equal
            result = swar64_strstr_memcmp<5>(s, n, needle, memcmp4);
            break;

        case 6:
            result = swar64_strstr_memcmp<6>(s, n, needle, memcmp4);
            break;

        case 7:
            result = swar64_strstr_memcmp<7>(s, n, needle, memcmp5);
            break;

        case 8:
            result = swar64_strstr_memcmp<8>(s, n, needle, memcmp6);
            break;

        case 9:
            // Note: use memcmp8 rather than memcmp7 for the same reason as above.
            result = swar64_strstr_memcmp<9>(s, n, needle, memcmp8);
            break;

        case 10:
            result = swar64_strstr_memcmp<10>(s, n, needle, memcmp8);
            break;

        case 11:
            result = swar64_strstr_memcmp<11>(s, n, needle, memcmp9);
            break;

        case 12:
            result = swar64_strstr_memcmp<12>(s, n, needle, memcmp10);
            break;

        default:
            result = swar64_strstr_anysize(s, n, needle, k);
            break;
    }

    // the kernels work on whole words and may report a position whose match
    // would extend past the end of the haystack
    if (result <= n - k) {
        return result;
    } else {
        return std::string::npos;
    }
}
// Convenience overload: forwards the data pointers and sizes of both
// strings to the pointer/length overload.
size_t swar64_strstr_v2(const std::string& s, const std::string& needle) {

    const char* const haystack = s.data();
    const char* const pattern  = needle.data();

    return swar64_strstr_v2(haystack, s.size(), pattern, needle.size());
}
================================================
FILE: utils/ansi.cpp
================================================
// Helpers for colouring terminal output with ANSI escape sequences.
namespace ansi {

    // colour codes placed between "ESC[" and "m"
    const int RED   = 31;
    const int GREEN = 32;
    const int WHITE = 37;

    // Returns str wrapped in the escape sequence selecting `color`,
    // followed by the reset sequence "ESC[0m".
    std::string seq(const std::string& str, int color) {

        std::string out("\033[");
        out += std::to_string(color);
        out += "m";
        out += str;
        out += "\033[0m";

        return out;
    }

} // namespace ansi
================================================
FILE: utils/avx2.cpp
================================================
// Debugging helpers for 256-bit AVX2 vectors.
namespace avx2 {

    // Overlays a vector with byte and 16-bit word arrays so that the
    // individual lanes can be read.
    union proxy {
        __m256i  vec;
        uint8_t  u8[32];
        uint16_t u16[16];
    };

    namespace dump {

        // Prints the 16 words of vec as 4-digit hex numbers, lowest index
        // first, followed by a newline.
        void epu16(const __m256i vec) {

            proxy view;
            view.vec = vec;

            for (const uint16_t word : view.u16) {
                printf("%04x ", word);
            }

            putchar('\n');
        }

        // Prints the 32 bytes of vec as 2-digit hex numbers, lowest index
        // first, enclosed in single quotes and followed by a newline.
        void epu8(const __m256i vec) {

            proxy view;
            view.vec = vec;

            putchar('\'');
            for (const uint8_t byte : view.u8) {
                printf("%02x ", byte);
            }
            putchar('\'');

            putchar('\n');
        }

    } // namespace dump

} // namespace avx2
================================================
FILE: utils/avx512.cpp
================================================
// Debugging helpers for 512-bit AVX-512 vectors.
namespace avx512 {

    // Overlays a vector with byte and 16-bit word arrays so that the
    // individual lanes can be read.
    union proxy {
        __m512i  vec;
        uint8_t  u8[64];
        uint16_t u16[32];
    };

    namespace dump {

        // Prints the 32 words of vec as 4-digit hex numbers, lowest index
        // first, followed by a newline.
        void epu16(const __m512i vec) {

            proxy view;
            view.vec = vec;

            for (const uint16_t word : view.u16) {
                printf("%04x ", word);
            }

            putchar('\n');
        }

        // Prints the 64 bytes of vec as 2-digit hex numbers, lowest index
        // first, enclosed in single quotes and followed by a newline.
        void epu8(const __m512i vec) {

            proxy view;
            view.vec = vec;

            putchar('\'');
            for (const uint8_t byte : view.u8) {
                printf("%02x ", byte);
            }
            putchar('\'');

            putchar('\n');
        }

    } // namespace dump

} // namespace avx512
================================================
FILE: utils/bits.cpp
================================================
// Small bit-manipulation helpers used when walking SIMD comparison masks.
namespace bits {

    // Returns value with its lowest set bit cleared (despite the name, it is
    // the least significant set bit that is removed: value & (value - 1)).
    // value must be non-zero.
    template <typename T>
    T clear_leftmost_set(const T value) {

        assert(value != 0);

        return value & (value - 1);
    }

    // Returns the zero-based index of the lowest set bit of value.
    // value must be non-zero.
    //
    // Types wider than unsigned are handled with the 64-bit builtin so that
    // no significant bits are dropped by the conversion to unsigned.
    template <typename T>
    unsigned get_first_bit_set(const T value) {

        assert(value != 0);

        return (sizeof(T) > sizeof(unsigned))
             ? __builtin_ctzll(static_cast<unsigned long long>(value))
             : __builtin_ctz(static_cast<unsigned>(value));
    }

    // 64-bit specialisation. __builtin_ctzll is used because `long` (the
    // operand type of __builtin_ctzl) is only 32 bits wide on some targets.
    template <>
    inline unsigned get_first_bit_set<uint64_t>(const uint64_t value) {

        assert(value != 0);

        return __builtin_ctzll(value);
    }

} // namespace bits
================================================
FILE: utils/neon.cpp
================================================
// Debugging helpers for ARM NEON vectors.
namespace neon {

    namespace dump {

        // Prints the 16 bytes of vec as 2-digit hex numbers, lowest index
        // first, enclosed in single quotes and followed by a newline.
        void epu8(const uint8x16_t vec) {

            uint8_t lanes[16];
            vst1q_u8(lanes, vec);

            putchar('\'');
            for (const uint8_t byte : lanes) {
                printf("%02x ", byte);
            }
            putchar('\'');

            putchar('\n');
        }

        // Prints the 8 bytes of vec as 2-digit hex numbers, lowest index
        // first, enclosed in single quotes and followed by a newline.
        void epu8(const uint8x8_t vec) {

            uint8_t lanes[8];
            vst1_u8(lanes, vec);

            putchar('\'');
            for (const uint8_t byte : lanes) {
                printf("%02x ", byte);
            }
            putchar('\'');

            putchar('\n');
        }

    } // namespace dump

} // namespace neon
================================================
FILE: utils/sse.cpp
================================================
namespace sse {

    // Unaligned 128-bit load: reinterprets `ptr` as a pointer to __m128i and
    // reads 16 bytes from it.
    template <typename T>
    __m128i load(T ptr) {

        const __m128i* source = reinterpret_cast<const __m128i*>(ptr);

        return _mm_loadu_si128(source);
    }


    // Returns a vector whose first `n` bytes are 0xff and whose remaining
    // 16 - n bytes are 0x00.
    //
    // Implemented as a sliding 16-byte window over a 32-byte table; the window
    // stays inside the table only for n <= 16 (not checked).
    __m128i mask_lower_bytes(size_t n) {

        static const uint8_t table[32] = {
            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
        };

        const uint8_t* const boundary = table + 16;

        return load(boundary - n);
    }


    // Returns a vector whose first `n` bytes are 0x00 and whose remaining
    // 16 - n bytes are 0xff.
    //
    // Same sliding-window scheme as mask_lower_bytes, with the table halves
    // swapped; the window stays inside the table only for n <= 16 (not checked).
    __m128i mask_higher_bytes(size_t n) {

        static const uint8_t table[32] = {
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        };

        const uint8_t* const boundary = table + 16;

        return load(boundary - n);
    }


    // Overlay giving byte-wise and 16-bit-word-wise access to a 128-bit vector.
    union proxy {
        __m128i  vec;
        uint8_t  u8[16];
        uint16_t u16[8];
    };


    namespace dump {

        // Debug helper: writes the eight 16-bit elements of `vec` to stdout,
        // element 0 first, each as four hex digits followed by a space, and
        // terminates the line.
        void epu16(const __m128i vec) {

            proxy view;
            view.vec = vec;

            for (const uint16_t word : view.u16) {
                printf("%04x ", word);
            }

            putchar('\n');
        }

        // Debug helper: writes the 16 bytes of `vec` to stdout, byte 0 first,
        // each as two hex digits followed by a space; the whole dump is
        // wrapped in single quotes and terminated with a newline.
        void epu8(const __m128i vec) {

            proxy view;
            view.vec = vec;

            putchar('\'');
            for (const uint8_t byte : view.u8) {
                printf("%02x ", byte);
            }
            putchar('\'');

            putchar('\n');
        }

    } // namespace dump

} // namespace sse
gitextract_jd4noqk0/
├── .gitignore
├── LICENSE
├── Makefile
├── README.rst
├── aarch64-strstr-v2.cpp
├── avx2-naive-strstr.cpp
├── avx2-naive-strstr64.cpp
├── avx2-naive-unrolled-strstr.cpp
├── avx2-strstr-v2-clang-specific.cpp
├── avx2-strstr-v2.cpp
├── avx2-strstr.cpp
├── avx512bw-strstr-v2.cpp
├── avx512bw-strstr-v3.cpp
├── avx512f-strstr-v2.cpp
├── avx512f-strstr.cpp
├── common.h
├── data/
│ └── placeholder
├── fixed-memcmp.cpp
├── make_words.sh
├── neon-strstr-v2.cpp
├── original/
│ ├── sse4_strstr-test.py
│ └── sse4_strstr.c
├── results/
│ ├── armv7-32bit-gcc4.9.2.txt
│ ├── armv8-64bit-clang3.8.0.txt
│ ├── bulldozer-fx-8510-gcc4.8.4-sse.txt
│ ├── cascadelake-Gold-5217-gcc-7.4.0-avx512bw.txt
│ ├── haswell-i7-4770-gcc5.4.1-avx2.txt
│ ├── knights-landing-7210-gcc5.3.0-avx512f.txt
│ ├── postprocess.py
│ ├── skylake-i7-6700-gcc5.4.1-avx2.txt
│ ├── skylake-i9-7900-gcc-5.4.1-avx512bw.txt
│ └── westmere-m540-gcc6.2.0-sse4.txt
├── scalar.cpp
├── src/
│ ├── all.h
│ ├── all_procedures.cpp
│ ├── application_base.cpp
│ ├── benchmark.cpp
│ ├── benchmark.h
│ ├── speedup.cpp
│ ├── unittests.cpp
│ └── validate.cpp
├── sse-naive-strstr.cpp
├── sse2-needle4.cpp
├── sse2-strstr.cpp
├── sse4-strstr-unrolled.cpp
├── sse4-strstr.cpp
├── sse4.2-strstr.cpp
├── swar32-strstr-v2.cpp
├── swar64-strstr-v2.cpp
└── utils/
├── ansi.cpp
├── avx2.cpp
├── avx512.cpp
├── bits.cpp
├── neon.cpp
└── sse.cpp
SYMBOL INDEX (154 symbols across 38 files)
FILE: aarch64-strstr-v2.cpp
function aarch64_strstr_v2 (line 101) | size_t aarch64_strstr_v2(const char* s, size_t n, const char* needle, si...
function aarch64_strstr_v2 (line 177) | size_t aarch64_strstr_v2(const std::string& s, const std::string& needle) {
FILE: avx2-naive-strstr.cpp
function avx2_naive_strstr (line 34) | size_t avx2_naive_strstr(const char* s, size_t n, const char* needle, si...
function avx2_naive_strstr (line 53) | size_t avx2_naive_strstr(const std::string& s, const std::string& needle) {
FILE: avx2-naive-strstr64.cpp
function avx2_naive_strstr64 (line 45) | size_t avx2_naive_strstr64(const char* s, size_t n, const char* needle, ...
function avx2_naive_strstr64 (line 64) | size_t avx2_naive_strstr64(const std::string& s, const std::string& need...
FILE: avx2-naive-unrolled-strstr.cpp
function avx2_naive_unrolled_strstr (line 49) | size_t avx2_naive_unrolled_strstr(const char* s, size_t n, const char* n...
function avx2_naive_unrolled_strstr (line 68) | size_t avx2_naive_unrolled_strstr(const std::string& s, const std::strin...
FILE: avx2-strstr-v2-clang-specific.cpp
type inner_loop_aux (line 16) | struct inner_loop_aux
type inner_loop_aux<K, i, false> (line 19) | struct inner_loop_aux<K, i, false> {
type inner_loop_aux<K, i, true> (line 28) | struct inner_loop_aux<K, i, true> {
type inner_loop (line 35) | struct inner_loop {
FILE: avx2-strstr-v2.cpp
function avx2_strstr_v2 (line 122) | size_t avx2_strstr_v2(const char* s, size_t n, const char* needle, size_...
function avx2_strstr_v2 (line 201) | size_t avx2_strstr_v2(const std::string& s, const std::string& needle) {
FILE: avx2-strstr.cpp
function avx2_strstr_long (line 1) | size_t avx2_strstr_long(const char* s, size_t n, const char* neddle, siz...
function avx2_strstr_len4 (line 48) | size_t avx2_strstr_len4(const char* s, size_t n, const char* neddle) {
function avx2_strstr (line 81) | size_t avx2_strstr(const char* s, size_t n, const char* neddle, size_t n...
function avx2_strstr (line 125) | size_t avx2_strstr(const std::string& s, const std::string& neddle) {
FILE: avx512bw-strstr-v2.cpp
function avx512bw_strstr_v2_anysize (line 3) | size_t avx512bw_strstr_v2_anysize(const char* string, size_t n, const ch...
function avx512bw_strstr_v2_memcmp (line 40) | size_t avx512bw_strstr_v2_memcmp(const char* string, size_t n, const cha...
function avx512bw_strstr_v2 (line 77) | size_t avx512bw_strstr_v2(const char* s, size_t n, const char* needle, s...
function avx512bw_strstr_v2 (line 153) | size_t avx512bw_strstr_v2(const std::string& s, const std::string& needl...
FILE: avx512bw-strstr-v3.cpp
function avx512bw_strstr_v3_anysize (line 3) | size_t avx512bw_strstr_v3_anysize(const char* string, size_t n, const ch...
function avx512bw_strstr_v3_memcmp (line 43) | size_t avx512bw_strstr_v3_memcmp(const char* string, size_t n, const cha...
function avx512bw_strstr_v3 (line 83) | size_t avx512bw_strstr_v3(const char* s, size_t n, const char* needle, s...
function avx512bw_strstr_v3 (line 159) | size_t avx512bw_strstr_v3(const std::string& s, const std::string& needl...
FILE: avx512f-strstr-v2.cpp
function __mmask16 (line 3) | __mmask16 FORCE_INLINE zero_byte_mask(const __m512i v) {
function avx512f_strstr_v2_anysize (line 16) | size_t avx512f_strstr_v2_anysize(const char* string, size_t n, const cha...
function avx512f_strstr_v2_memcmp (line 83) | size_t avx512f_strstr_v2_memcmp(const char* string, size_t n, const char...
function avx512f_strstr_v2 (line 132) | size_t avx512f_strstr_v2(const char* s, size_t n, const char* needle, si...
function avx512f_strstr_v2 (line 208) | size_t avx512f_strstr_v2(const std::string& s, const std::string& needle) {
FILE: avx512f-strstr.cpp
function avx512f_strstr_long (line 7) | size_t avx512f_strstr_long(const char* string, size_t n, const char* nee...
function avx512f_strstr_eq4 (line 102) | size_t avx512f_strstr_eq4(const char* string, size_t n, const char* need...
function avx512f_strstr (line 188) | size_t avx512f_strstr(const char* s, size_t n, const char* needle, size_...
function avx512f_strstr (line 231) | size_t avx512f_strstr(const std::string& s, const std::string& needle) {
FILE: fixed-memcmp.cpp
function MAYBE_UNUSED (line 5) | MAYBE_UNUSED
function MAYBE_UNUSED (line 10) | MAYBE_UNUSED
function MAYBE_UNUSED (line 15) | MAYBE_UNUSED
function MAYBE_UNUSED (line 22) | MAYBE_UNUSED
function MAYBE_UNUSED (line 34) | MAYBE_UNUSED
function MAYBE_UNUSED (line 42) | MAYBE_UNUSED
function MAYBE_UNUSED (line 54) | MAYBE_UNUSED
function MAYBE_UNUSED (line 66) | MAYBE_UNUSED
function MAYBE_UNUSED (line 78) | MAYBE_UNUSED
function MAYBE_UNUSED (line 86) | MAYBE_UNUSED
function MAYBE_UNUSED (line 94) | MAYBE_UNUSED
function MAYBE_UNUSED (line 104) | MAYBE_UNUSED
function MAYBE_UNUSED (line 118) | MAYBE_UNUSED
FILE: neon-strstr-v2.cpp
function neon_strstr_v2 (line 160) | size_t neon_strstr_v2(const char* s, size_t n, const char* needle, size_...
function neon_strstr_v2 (line 236) | size_t neon_strstr_v2(const std::string& s, const std::string& needle) {
FILE: original/sse4_strstr-test.py
function time_command (line 16) | def time_command(command):
function time (line 24) | def time(command1, command2, iters=10):
function compare (line 34) | def compare(filename, wordpos, word, wordlen):
FILE: original/sse4_strstr.c
function help (line 431) | void help() {
function main (line 438) | int main(int argc, char* argv[]) {
FILE: results/postprocess.py
function load (line 3) | def load(file):
function main (line 21) | def main():
FILE: scalar.cpp
function strstr_naive (line 4) | size_t strstr_naive(const char * hay, size_t size, const char *needle, s...
FILE: src/all_procedures.cpp
type Procedures (line 10) | struct Procedures {
type Item (line 12) | struct Item {
method Item (line 18) | Item(str_find_fun proc_, const char* name_, char code_, bool builtin...
method Item (line 27) | const Item& operator[](char code) {
method Item (line 18) | Item(str_find_fun proc_, const char* name_, char code_, bool builtin...
function strstr_libc (line 39) | size_t strstr_libc(const char* s, size_t, const char* needle, size_t) {
function Procedures (line 48) | Procedures all_procedures() {
type Item (line 12) | struct Item {
method Item (line 18) | Item(str_find_fun proc_, const char* name_, char code_, bool builtin...
method Item (line 27) | const Item& operator[](char code) {
method Item (line 18) | Item(str_find_fun proc_, const char* name_, char code_, bool builtin...
FILE: src/application_base.cpp
class ApplicationBase (line 1) | class ApplicationBase {
class Error (line 8) | class Error final {
method Error (line 13) | Error(const std::string& msg) : message(msg) {}
method prepare (line 17) | void prepare(const std::string& file_name, const std::string& words_na...
method load_text (line 24) | void load_text(const std::string& path) {
method load_words (line 47) | void load_words(const std::string& path) {
method throw_errno (line 73) | void throw_errno(const std::string& prefix) {
FILE: src/benchmark.cpp
class Application (line 16) | class Application final: public ApplicationBase {
type TestType (line 21) | enum class TestType {
type Parameters (line 27) | struct Parameters {
method Application (line 36) | Application(const Parameters& params)
method print_help (line 308) | static void print_help(const char* progname) {
method measure (line 336) | void measure(T_FIND find, char code) {
method is_enabled (line 346) | bool is_enabled(char proc) const {
method prepare_needle (line 351) | void prepare_needle() {
method prepare_input (line 358) | void prepare_input() {
method prepare (line 383) | void prepare() {
function parse (line 394) | bool parse(int argc, char* argv[], Application::Parameters& p) {
function main (line 437) | int main(int argc, char* argv[]) {
FILE: src/benchmark.h
function rdtsc_overhead_func (line 35) | __attribute__ ((noinline))
FILE: src/speedup.cpp
class Application (line 16) | class Application final: public ApplicationBase {
type Parameters (line 23) | struct Parameters {
method Application (line 31) | Application(const Parameters& params)
method print_help (line 279) | static void print_help(const char* progname) {
method measure (line 298) | void measure(T_FIND find, char code) {
method is_enabled (line 323) | bool is_enabled(char proc) const {
function parse (line 330) | bool parse(int argc, char* argv[], Application::Parameters& p) {
function main (line 361) | int main(int argc, char* argv[]) {
FILE: src/unittests.cpp
function test (line 13) | bool test(const char* name, str_find_fun strstr_function) {
function main (line 54) | int main() {
FILE: src/validate.cpp
class Application (line 18) | class Application final: public ApplicationBase {
method Application (line 21) | Application(const std::string& file_name, const std::string& words_nam...
method run (line 25) | bool run() {
method print_help (line 68) | static void print_help(const char* progname) {
method print_progress (line 80) | void print_progress(size_t pos, size_t n) {
function main (line 88) | int main(int argc, char* argv[]) {
FILE: sse-naive-strstr.cpp
function sse_naive_strstr (line 34) | size_t sse_naive_strstr(const char* s, size_t n, const char* needle, siz...
function sse_naive_strstr (line 53) | size_t sse_naive_strstr(const std::string& s, const std::string& needle) {
FILE: sse2-needle4.cpp
function sse2_strstr_needle4 (line 38) | size_t sse2_strstr_needle4(const char* s, size_t n, const char* needle, ...
function sse2_strstr_needle4 (line 49) | size_t sse2_strstr_needle4(const std::string& s, const std::string& need...
function sse2_strstr_needle4_v2 (line 106) | size_t sse2_strstr_needle4_v2(const char* s, size_t n, const char* needl...
function sse2_strstr_needle4_v2 (line 117) | size_t sse2_strstr_needle4_v2(const std::string& s, const std::string& n...
FILE: sse2-strstr.cpp
function sse2_strstr_v2 (line 74) | size_t sse2_strstr_v2(const char* s, size_t n, const char* needle, size_...
function sse2_strstr_v2 (line 150) | size_t sse2_strstr_v2(const std::string& s, const std::string& needle) {
FILE: sse4-strstr-unrolled.cpp
function sse4_strstr_unrolled_anysize (line 11) | size_t sse4_strstr_unrolled_anysize(const char* s, size_t n, const char*...
function sse4_strstr_unrolled_memcmp (line 55) | size_t sse4_strstr_unrolled_memcmp(const char* s, size_t n, const char* ...
function sse4_strstr_unrolled_max20 (line 98) | size_t sse4_strstr_unrolled_max20(const char* s, size_t n, const char* n...
function sse4_strstr_unrolled_max36 (line 135) | size_t sse4_strstr_unrolled_max36(const char* s, size_t n, const char* n...
function sse4_strstr_unrolled_len3 (line 176) | size_t sse4_strstr_unrolled_len3(const char* s, size_t n, const char* ne...
function sse4_strstr_unrolled_len4 (line 202) | size_t sse4_strstr_unrolled_len4(const char* s, size_t n, const char* ne...
function sse4_strstr_unrolled (line 227) | size_t sse4_strstr_unrolled(const char* s, size_t n, const char* needle,...
function sse4_strstr_unrolled (line 332) | size_t sse4_strstr_unrolled(const std::string& s, const std::string& nee...
FILE: sse4-strstr.cpp
function sse4_strstr_anysize (line 1) | size_t sse4_strstr_anysize(const char* s, size_t n, const char* needle, ...
function sse4_strstr_memcmp (line 36) | size_t sse4_strstr_memcmp(const char* s, size_t n, const char* needle, M...
function sse4_strstr_max20 (line 70) | size_t sse4_strstr_max20(const char* s, size_t n, const char* needle, si...
function sse4_strstr_max36 (line 107) | size_t sse4_strstr_max36(const char* s, size_t n, const char* needle, si...
function sse4_strstr_len3 (line 148) | size_t sse4_strstr_len3(const char* s, size_t n, const char* needle) {
function sse4_strstr_len4 (line 174) | size_t sse4_strstr_len4(const char* s, size_t n, const char* needle) {
function sse4_strstr (line 199) | size_t sse4_strstr(const char* s, size_t n, const char* needle, size_t n...
function sse4_strstr (line 302) | size_t sse4_strstr(const std::string& s, const std::string& needle) {
FILE: sse4.2-strstr.cpp
function sse42_strstr (line 72) | size_t sse42_strstr(const char* s, size_t n, const char* needle, size_t ...
function sse42_strstr (line 148) | size_t sse42_strstr(const std::string& s, const std::string& needle) {
FILE: swar32-strstr-v2.cpp
function swar32_strstr_v2 (line 78) | size_t swar32_strstr_v2(const char* s, size_t n, const char* needle, siz...
function swar32_strstr_v2 (line 156) | size_t swar32_strstr_v2(const std::string& s, const std::string& needle) {
FILE: swar64-strstr-v2.cpp
function swar64_strstr_v2 (line 78) | size_t swar64_strstr_v2(const char* s, size_t n, const char* needle, siz...
function swar64_strstr_v2 (line 156) | size_t swar64_strstr_v2(const std::string& s, const std::string& needle) {
FILE: utils/ansi.cpp
type ansi (line 1) | namespace ansi {
function seq (line 7) | std::string seq(const std::string& str, int color) {
FILE: utils/avx2.cpp
type avx2 (line 1) | namespace avx2 {
type dump (line 10) | namespace dump {
function epu16 (line 12) | void epu16(const __m256i vec) {
function epu8 (line 24) | void epu8(const __m256i vec) {
FILE: utils/avx512.cpp
type avx512 (line 1) | namespace avx512 {
type dump (line 10) | namespace dump {
function epu16 (line 12) | void epu16(const __m512i vec) {
function epu8 (line 24) | void epu8(const __m512i vec) {
FILE: utils/bits.cpp
type bits (line 2) | namespace bits {
function T (line 5) | T clear_leftmost_set(const T value) {
function get_first_bit_set (line 14) | unsigned get_first_bit_set(const T value) {
FILE: utils/neon.cpp
type neon (line 1) | namespace neon {
type dump (line 3) | namespace dump {
function epu8 (line 5) | void epu8(const uint8x16_t vec) {
function epu8 (line 19) | void epu8(const uint8x8_t vec) {
FILE: utils/sse.cpp
type sse (line 1) | namespace sse {
function __m128i (line 4) | __m128i load(T ptr) {
function __m128i (line 9) | __m128i mask_lower_bytes(size_t n) {
function __m128i (line 23) | __m128i mask_higher_bytes(size_t n) {
type dump (line 45) | namespace dump {
function epu16 (line 47) | void epu16(const __m128i vec) {
function epu8 (line 59) | void epu8(const __m128i vec) {
Condensed preview — 55 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (203K chars).
[
{
"path": ".gitignore",
"chars": 383,
"preview": "speedup_sse4\nbenchmark_sse4\nunittests_sse4\nvalidate_sse4\n\nspeedup_avx2\nbenchmark_avx2\nunittests_avx2\nvalidate_avx2\n\nspee"
},
{
"path": "LICENSE",
"chars": 1306,
"preview": "Copyright (c) 2008-2016, Wojciech Muła\nAll rights reserved.\n\nRedistribution and use in source and binary forms, with or "
},
{
"path": "Makefile",
"chars": 6178,
"preview": ".PHONY: all clean compile_intel\n\nFLAGS=-std=c++11 -O3 -Wall -Wextra -pedantic -I. $(CXXFLAGS)\nFLAGS_INTEL=$(FLAGS) -DHAV"
},
{
"path": "README.rst",
"chars": 1139,
"preview": "================================================================================\n SIMD-friendly algorithms f"
},
{
"path": "aarch64-strstr-v2.cpp",
"chars": 4890,
"preview": "size_t FORCE_INLINE aarch64_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) {\n\n assert(k > 0);\n"
},
{
"path": "avx2-naive-strstr.cpp",
"chars": 1660,
"preview": "// Method descibed in https://arxiv.org/pdf/1612.01506.pdf\n//\n// Implementation by Daniel Lemire\n// https://github.com/l"
},
{
"path": "avx2-naive-strstr64.cpp",
"chars": 2303,
"preview": "// Method descibed in https://arxiv.org/pdf/1612.01506.pdf\n//\n// Implementation by Daniel Lemire\n// https://github.com/W"
},
{
"path": "avx2-naive-unrolled-strstr.cpp",
"chars": 2792,
"preview": "// Method described in https://arxiv.org/pdf/1612.01506.pdf\n//\n// Implementation by Daniel Lemire\n\nsize_t FORCE_INLINE a"
},
{
"path": "avx2-strstr-v2-clang-specific.cpp",
"chars": 1409,
"preview": "/*\n The following templates implement the loop, where K is a template parameter.\n\n for (unsigned i=1; i < K; i"
},
{
"path": "avx2-strstr-v2.cpp",
"chars": 5978,
"preview": "// implements scheme described in http://0x80.pl/articles/simd-friendly-karp-rabin.html\n\nsize_t FORCE_INLINE avx2_strstr"
},
{
"path": "avx2-strstr.cpp",
"chars": 3721,
"preview": "size_t avx2_strstr_long(const char* s, size_t n, const char* neddle, size_t neddle_size) {\n \n assert(neddle_size >"
},
{
"path": "avx512bw-strstr-v2.cpp",
"chars": 4320,
"preview": "// implements scheme described in http://0x80.pl/articles/simd-friendly-karp-rabin.html\n\nsize_t avx512bw_strstr_v2_anysi"
},
{
"path": "avx512bw-strstr-v3.cpp",
"chars": 4480,
"preview": "// implements scheme described in http://0x80.pl/articles/simd-friendly-karp-rabin.html\n\nsize_t avx512bw_strstr_v3_anysi"
},
{
"path": "avx512f-strstr-v2.cpp",
"chars": 6321,
"preview": "// implements scheme described in http://0x80.pl/articles/simd-friendly-karp-rabin.html\n\n__mmask16 FORCE_INLINE zero_byt"
},
{
"path": "avx512f-strstr.cpp",
"chars": 6287,
"preview": "/*\n string - pointer to the string\n n - string length in bytes\n needle - pointer to another string\n n "
},
{
"path": "common.h",
"chars": 279,
"preview": "#pragma once\n\n#define FORCE_INLINE inline __attribute__((always_inline))\n#define MAYBE_UNUSED inline __attribute__((unus"
},
{
"path": "data/placeholder",
"chars": 12,
"preview": "placeholder\n"
},
{
"path": "fixed-memcmp.cpp",
"chars": 3929,
"preview": "// #define USE_SIMPLE_MEMCMP // when defined simpler expressions are used\n\nnamespace {\n\n MAYBE_UNUSED\n bool always"
},
{
"path": "make_words.sh",
"chars": 73,
"preview": "# split words\ncat $1 \\\n | tr -s -c \"a-zA-Z\" \"\\n\" \\\n | sort -u \\\n > $2\n"
},
{
"path": "neon-strstr-v2.cpp",
"chars": 6272,
"preview": "size_t FORCE_INLINE neon_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) {\n\n assert(k > 0);\n "
},
{
"path": "original/sse4_strstr-test.py",
"chars": 1724,
"preview": "import sys, os, random\n\nfilename = \"<unspecified>\"\ntry:\n\tfilename = sys.argv[1]\n\tstring = open(filename, \"r\").read()\nexc"
},
{
"path": "original/sse4_strstr.c",
"chars": 15071,
"preview": "/*\n\tSSE4 string search --- modification of Karp-Rabin algorithm, $Revision: 1.11 $\n\t\n\tAcceleration of strstr using SSE4 "
},
{
"path": "results/armv7-32bit-gcc4.9.2.txt",
"chars": 2085,
"preview": "./speedup_arm data/i386.txt data/words 1\nstd::strstr ... reference result = 810807651, time "
},
{
"path": "results/armv8-64bit-clang3.8.0.txt",
"chars": 2820,
"preview": "std::strstr ... reference result = 810807651, time = 3.457578 s\nstd::string::find "
},
{
"path": "results/bulldozer-fx-8510-gcc4.8.4-sse.txt",
"chars": 1819,
"preview": "./speedup data/i386.txt data/words \nstd::strstr ... reference result = 8108076510, time = "
},
{
"path": "results/cascadelake-Gold-5217-gcc-7.4.0-avx512bw.txt",
"chars": 4692,
"preview": "./speedup_avx512bw data/i386.txt data/words\nscalar (naive) ... reference result = 8108076510, t"
},
{
"path": "results/haswell-i7-4770-gcc5.4.1-avx2.txt",
"chars": 2403,
"preview": "./speedup_avx2 data/i386.txt data/words \nstd::strstr ... reference result = 8108076510, time"
},
{
"path": "results/knights-landing-7210-gcc5.3.0-avx512f.txt",
"chars": 2979,
"preview": "./speedup_avx512 data/i386.txt data/words \nstd::strstr ... reference result = 8108076510, ti"
},
{
"path": "results/postprocess.py",
"chars": 697,
"preview": "from collections import OrderedDict\n\ndef load(file):\n D = OrderedDict()\n for line in file:\n if 'reference r"
},
{
"path": "results/skylake-i7-6700-gcc5.4.1-avx2.txt",
"chars": 2403,
"preview": "./speedup_avx2 data/i386.txt data/words \nstd::strstr ... reference result = 8108076510, time"
},
{
"path": "results/skylake-i9-7900-gcc-5.4.1-avx512bw.txt",
"chars": 4980,
"preview": "./speedup_avx512bw data/i386.txt data/words \nnaive scalar ... reference result = 8108076510, "
},
{
"path": "results/westmere-m540-gcc6.2.0-sse4.txt",
"chars": 1818,
"preview": "./speedup data/i386.txt data/words \nstd::strstr ... reference result = 8108076510, time = "
},
{
"path": "scalar.cpp",
"chars": 716,
"preview": "// Implementation by Daniel Lemire\n// https://github.com/WojciechMula/sse4-strstr/issues/2\n\nsize_t strstr_naive(const ch"
},
{
"path": "src/all.h",
"chars": 1102,
"preview": "#pragma once\n\n#include \"common.h\"\n#include <utils/bits.cpp>\n#include <errno.h>\n#include \"fixed-memcmp.cpp\"\n#include \"sca"
},
{
"path": "src/all_procedures.cpp",
"chars": 3201,
"preview": "#include \"all.h\"\n\n#include <string>\n#include <vector>\n#include <algorithm>\n#include <stdexcept>\n\nusing str_find_fun = si"
},
{
"path": "src/application_base.cpp",
"chars": 1573,
"preview": "class ApplicationBase {\n\nprotected:\n std::string file;\n std::vector<std::string> words;\n\npublic:\n class Error f"
},
{
"path": "src/benchmark.cpp",
"chars": 11663,
"preview": "#include <cstdio>\n#include <cstdint>\n#include <cassert>\n#include <cstring>\n#include <string>\n#include <vector>\n\n#include"
},
{
"path": "src/benchmark.h",
"chars": 5250,
"preview": "#ifndef _BENCHMARK_H_\n#define _BENCHMARK_H_\n\n#include <stdint.h>\n#define RDTSC_START(cycles) "
},
{
"path": "src/speedup.cpp",
"chars": 9294,
"preview": "#include <cstdio>\n#include <cstdint>\n#include <cassert>\n#include <cstring>\n#include <string>\n#include <vector>\n#include "
},
{
"path": "src/unittests.cpp",
"chars": 1753,
"preview": "#include <cstdio>\n#include <cstdint>\n#include <cassert>\n#include <cstring>\n#include <string>\n#include <vector>\n\n#include"
},
{
"path": "src/validate.cpp",
"chars": 2852,
"preview": "#include <cstdio>\n#include <cstdint>\n#include <cassert>\n#include <cstring>\n#include <string>\n#include <vector>\n\n// -----"
},
{
"path": "sse-naive-strstr.cpp",
"chars": 1639,
"preview": "// Method descibed in https://arxiv.org/pdf/1612.01506.pdf\n//\n// Implementation by Daniel Lemire\n// https://github.com/l"
},
{
"path": "sse2-needle4.cpp",
"chars": 3864,
"preview": "size_t FORCE_INLINE sse2_needle4(const char* s, size_t n, const char* needle, size_t k) {\n\n uint32_t u32;\n memcpy("
},
{
"path": "sse2-strstr.cpp",
"chars": 4251,
"preview": "// implements scheme described in http://0x80.pl/articles/simd-friendly-karp-rabin.html\n\nsize_t FORCE_INLINE sse2_strstr"
},
{
"path": "sse4-strstr-unrolled.cpp",
"chars": 10077,
"preview": "// Note: it appears that these specialized functions do not help.\n// But I decided to left them, just in case.\n\n//"
},
{
"path": "sse4-strstr.cpp",
"chars": 8636,
"preview": "size_t sse4_strstr_anysize(const char* s, size_t n, const char* needle, size_t needle_size) {\n\n assert(needle_size > "
},
{
"path": "sse4.2-strstr.cpp",
"chars": 3920,
"preview": "/* Usage of PCMPESTRM instruction from SSE 4.1 */\n\nsize_t FORCE_INLINE sse42_strstr_anysize(const char* s, size_t n, con"
},
{
"path": "swar32-strstr-v2.cpp",
"chars": 4653,
"preview": "size_t FORCE_INLINE swar32_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) {\n\n assert(k > 0);\n "
},
{
"path": "swar64-strstr-v2.cpp",
"chars": 4753,
"preview": "size_t FORCE_INLINE swar64_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) {\n\n assert(k > 0);\n "
},
{
"path": "utils/ansi.cpp",
"chars": 259,
"preview": "namespace ansi {\n \n const int RED = 31;\n const int GREEN = 32;\n const int WHITE = 37;\n\n std::string seq"
},
{
"path": "utils/avx2.cpp",
"chars": 710,
"preview": "namespace avx2 {\n\n union proxy {\n __m256i vec;\n uint8_t u8[32];\n uint16_t u16[16];\n };\n\n\n "
},
{
"path": "utils/avx512.cpp",
"chars": 712,
"preview": "namespace avx512 {\n\n union proxy {\n __m512i vec;\n uint8_t u8[64];\n uint16_t u16[32];\n };\n\n\n"
},
{
"path": "utils/bits.cpp",
"chars": 486,
"preview": "\nnamespace bits {\n\n template <typename T>\n T clear_leftmost_set(const T value) {\n\n assert(value != 0);\n\n "
},
{
"path": "utils/neon.cpp",
"chars": 680,
"preview": "namespace neon {\n\n namespace dump {\n\n void epu8(const uint8x16_t vec) {\n \n uint8_t p[16]"
},
{
"path": "utils/sse.cpp",
"chars": 1668,
"preview": "namespace sse {\n\n template <typename T>\n __m128i load(T ptr) {\n \n return _mm_loadu_si128(reinterpret"
}
]
About this extraction
This page contains the full source code of the WojciechMula/sse4-strstr GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 55 files (186.5 KB), approximately 59.0k tokens, and a symbol index with 154 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.