[
  {
    "path": ".gitignore",
    "content": "speedup_sse4\nbenchmark_sse4\nunittests_sse4\nvalidate_sse4\n\nspeedup_avx2\nbenchmark_avx2\nunittests_avx2\nvalidate_avx2\n\nspeedup_avx512f\nbenchmark_avx512f\nunittests_avx512f\nvalidate_avx512f\n\nunittests_avx512bw\nbenchmark_avx512bw\nvalidate_avx512bw\nspeedup_avx512bw\n\nspeedup_arm\nunittests_arm\nvalidate_arm\n\nspeedup_aarch64\nunittests_aarch64\nvalidate_aarch64\n\ndata/i386.txt\ndata/words\n\ntags\n"
  },
  {
    "path": "LICENSE",
    "content": "Copyright (c) 2008-2016, Wojciech Muła\nAll rights reserved.\n\nRedistribution and use in source and binary forms, with or without\nmodification, are permitted provided that the following conditions are\nmet:\n\n1. Redistributions of source code must retain the above copyright\n   notice, this list of conditions and the following disclaimer.\n\n2. Redistributions in binary form must reproduce the above copyright\n   notice, this list of conditions and the following disclaimer in the\n   documentation and/or other materials provided with the distribution.\n\nTHIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS \"AS\nIS\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED\nTO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A\nPARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT\nHOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,\nSPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED\nTO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR\nPROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF\nLIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING\nNEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS\nSOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.\n"
  },
  {
    "path": "Makefile",
    "content": ".PHONY: all clean compile_intel\n\nFLAGS=-std=c++11 -O3 -Wall -Wextra -pedantic -I. $(CXXFLAGS)\nFLAGS_INTEL=$(FLAGS) -DHAVE_SSE_INSTRUCTIONS\nFLAGS_SSE4=$(FLAGS_INTEL) -msse4.2\nFLAGS_AVX2=$(FLAGS_INTEL) -mavx2 -DHAVE_AVX2_INSTRUCTIONS\nFLAGS_AVX512F=$(FLAGS_INTEL) -mavx512f -DHAVE_AVX2_INSTRUCTIONS -DHAVE_AVX512F_INSTRUCTIONS\nFLAGS_AVX512BW=$(FLAGS_INTEL) -mavx512bw -DHAVE_AVX2_INSTRUCTIONS -DHAVE_AVX512F_INSTRUCTIONS -DHAVE_AVX512BW_INSTRUCTIONS\nFLAGS_ARM=$(FLAGS) -mfpu=neon -DHAVE_NEON_INSTRUCTIONS\nFLAGS_AARCH64=$(FLAGS) -DHAVE_NEON_INSTRUCTIONS -DHAVE_AARCH64_ARCHITECTURE\n\nDEPS=utils/ansi.cpp utils/bits.cpp common.h fixed-memcmp.cpp\nDEPS_SCALAR=swar64-strstr-v2.cpp swar32-strstr-v2.cpp scalar.cpp\nDEPS_SSE4=sse4-strstr.cpp sse4-strstr-unrolled.cpp sse4.2-strstr.cpp sse2-strstr.cpp sse-naive-strstr.cpp sse2-needle4.cpp utils/sse.cpp $(DEPS) $(DEPS_SCALAR)\nDEPS_AVX2=avx2-*.cpp utils/avx2.cpp $(DEPS_SSE4)\nDEPS_AVX512F=avx512f-*.cpp utils/avx512.cpp $(DEPS_AVX2)\nDEPS_AVX512BW=avx512bw-*.cpp utils/avx512.cpp $(DEPS_AVX512F)\nDEPS_ARM=neon-strstr-v2.cpp $(DEPS) $(DEPS_SCALAR)\nDEPS_AARCH64=aarch64-strstr-v2.cpp $(DEPS_ARM)\n\nALL_INTEL=\\\n    validate_sse4 \\\n    speedup_sse4 \\\n    benchmark_sse4 \\\n    unittests_sse4 \\\n    validate_avx2 \\\n    speedup_avx2 \\\n    benchmark_avx2 \\\n    unittests_avx2 \\\n    validate_avx512f \\\n    speedup_avx512f \\\n    benchmark_avx512f \\\n    unittests_avx512f \\\n    speedup_avx512bw \\\n    benchmark_avx512bw \\\n    validate_avx512bw \\\n    unittests_avx512bw \\\n\nALL_ARM=\\\n    validate_arm \\\n    unittests_arm \\\n    speedup_arm\n\nALL_AARCH64=\\\n    validate_aarch64 \\\n    unittests_aarch64 \\\n    speedup_aarch64\n\nALL=$(ALL_INTEL) $(ALL_ARM) $(ALL_AARCH64)\n\nall:\n\t@echo \"select target test_ARCH or run_ARCH\"\n\t@echo\n\t@echo \"test_ARCH runs unit and validation tests\"\n\t@echo \"run_ARCH  runs performance tests\"\n\t@echo\n\t@echo \"ARCH might be:\"\n\t@echo \"* 
sse4\"\n\t@echo \"* avx2\"\n\t@echo \"* avx512f\"\n\t@echo \"* avx512bw\"\n\t@echo \"* arm\"\n\t@echo \"* aarch64\"\n\nbuild_intel:   $(ALL_INTEL)\nbuild_arm:     $(ALL_ARM)\nbuild_aarch64: $(ALL_AARCH64)\n\nUNITTESTS_DEPS=src/unittests.cpp src/all_procedures.cpp\nVALIDATE_DEPS=src/validate.cpp src/application_base.cpp src/all_procedures.cpp\nSPEEDUP_DEPS=src/speedup.cpp src/application_base.cpp src/all_procedures.cpp\nBENCHMARK_DEPS=src/benchmark.cpp src/benchmark.h src/application_base.cpp src/all_procedures.cpp\n\nvalidate_sse4: $(VALIDATE_DEPS) $(DEPS_SSE4)\n\t$(CXX) $(FLAGS_SSE4) src/validate.cpp -o $@\n\nspeedup_sse4: $(SPEEDUP_DEPS) $(DEPS_SSE4)\n\t$(CXX) $(FLAGS_SSE4) -DNDEBUG src/speedup.cpp -o $@\n\nbenchmark_sse4: $(BENCHMARK_DEPS) $(DEPS_SSE4)\n\t$(CXX) $(FLAGS_SSE4) -DNDEBUG src/benchmark.cpp -o $@\n\nunittests_sse4: $(UNITTESTS_DEPS) $(DEPS_SSE4)\n\t$(CXX) $(FLAGS_SSE4) src/unittests.cpp -o $@\n\nvalidate_avx2: $(VALIDATE_DEPS) $(DEPS_AVX2)\n\t$(CXX) $(FLAGS_AVX2) src/validate.cpp -o $@\n\nspeedup_avx2: $(SPEEDUP_DEPS) $(DEPS_AVX2)\n\t$(CXX) $(FLAGS_AVX2) -DNDEBUG  src/speedup.cpp -o $@\n\nbenchmark_avx2: $(BENCHMARK_DEPS) $(DEPS_SSE4)\n\t$(CXX) $(FLAGS_AVX2) -DNDEBUG src/benchmark.cpp -o $@\n\nunittests_avx2: $(UNITTESTS_DEPS) $(DEPS_AVX2)\n\t$(CXX) $(FLAGS_AVX2) src/unittests.cpp -o $@\n\nvalidate_avx512f: $(VALIDATE_DEPS) $(DEPS_AVX512F)\n\t$(CXX) $(FLAGS_AVX512F) src/validate.cpp -o $@\n\nbenchmark_avx512f: $(BENCHMARK_DEPS) $(DEPS_SSE4)\n\t$(CXX) $(FLAGS_AVX512F) -DNDEBUG src/benchmark.cpp -o $@\n\nspeedup_avx512f: $(SPEEDUP_DEPS) $(DEPS_AVX512F)\n\t$(CXX) $(FLAGS_AVX512F) -DNDEBUG  src/speedup.cpp -o $@\n\nunittests_avx512f: $(UNITTESTS_DEPS) $(DEPS_AVX512F)\n\t$(CXX) $(FLAGS_AVX512F) src/unittests.cpp -o $@\n\nvalidate_avx512bw: $(VALIDATE_DEPS) $(DEPS_AVX512BW)\n\t$(CXX) $(FLAGS_AVX512BW) src/validate.cpp -o $@\n\nspeedup_avx512bw: $(SPEEDUP_DEPS) $(DEPS_AVX512BW)\n\t$(CXX) $(FLAGS_AVX512BW) -DNDEBUG src/speedup.cpp -o 
$@\n\nbenchmark_avx512bw: $(BENCHMARK_DEPS) $(DEPS_SSE4)\n\t$(CXX) $(FLAGS_AVX512BW) -DNDEBUG src/benchmark.cpp -o $@\n\nunittests_avx512bw: $(UNITTESTS_DEPS) $(DEPS_AVX512BW)\n\t$(CXX) $(FLAGS_AVX512BW) src/unittests.cpp -o $@\n\nvalidate_arm: $(VALIDATE_DEPS) $(DEPS_ARM)\n\t$(CXX) $(FLAGS_ARM) src/validate.cpp -o $@\n\nspeedup_arm: $(SPEEDUP_DEPS) $(DEPS_ARM)\n\t$(CXX) $(FLAGS_ARM) -DNDEBUG  src/speedup.cpp -o $@\n\nunittests_arm: $(UNITTESTS_DEPS) $(DEPS_ARM)\n\t$(CXX) $(FLAGS_ARM) src/unittests.cpp -o $@\n\nvalidate_aarch64: $(VALIDATE_DEPS) $(DEPS_AARCH64)\n\t$(CXX) $(FLAGS_AARCH64) src/validate.cpp -o $@\n\nspeedup_aarch64: $(SPEEDUP_DEPS) $(DEPS_AARCH64)\n\t$(CXX) $(FLAGS_AARCH64) -DNDEBUG  src/speedup.cpp -o $@\n\nunittests_aarch64: $(UNITTESTS_DEPS) $(DEPS_ARM)\n\t$(CXX) $(FLAGS_AARCH64) src/unittests.cpp -o $@\n\ndata/i386.txt:\n\twget http://css.csail.mit.edu/6.858/2013/readings/i386.txt\n\tmv i386.txt data/i386.txt\n\ndata/words: data/i386.txt\n\tsh make_words.sh $^ $@\n\ntest_sse4: unittests_sse4 validate_sse4 data/words data/i386.txt\n\t./unittests_sse4\n\t./validate_sse4 data/i386.txt data/words\n\nrun_sse4: speedup_sse4 data/words data/i386.txt\n\t./speedup_sse4 data/i386.txt data/words\n\ntest_avx2: unittests_avx2 validate_avx2 data/words data/i386.txt\n\t./unittests_avx2\n\t./validate_avx2 data/i386.txt data/words\n\nrun_avx2: speedup_avx2 data/words data/i386.txt\n\t./speedup_avx2 data/i386.txt data/words\n\ntest_avx512f: unittests_avx512f validate_avx512f data/words data/i386.txt\n\t./unittests_avx512f\n\t./validate_avx512f data/i386.txt data/words\n\nrun_avx512f: speedup_avx512f data/words data/i386.txt\n\t./speedup_avx512f data/i386.txt data/words\n\nrun_avx512bw: speedup_avx512bw data/words data/i386.txt\n\t./speedup_avx512bw data/i386.txt data/words\n\ntest_avx512bw: unittests_avx512bw validate_avx512bw data/words data/i386.txt\n\t./unittests_avx512bw\n\t./validate_avx512bw data/i386.txt data/words\n\ntest_arm: unittests_arm validate_arm 
data/words data/i386.txt\n\t./unittests_arm\n\t./validate_arm data/i386.txt data/words\n\nrun_arm: speedup_arm data/words data/i386.txt\n\t# my Raspberry Pi is slow, repeat count = 1 is enough\n\t./$< data/i386.txt data/words 1\n\ntest_aarch64: unittests_aarch64 validate_aarch64 data/words data/i386.txt\n\t./unittests_aarch64\n\t./validate_aarch64 data/i386.txt data/words\n\nrun_aarch64: speedup_aarch64 data/words data/i386.txt\n\t./$< data/i386.txt data/words 1\n\ncompile_intel: $(ALL_INTEL)\n\nclean:\n\trm -f $(ALL)\n"
  },
  {
    "path": "README.rst",
    "content": "================================================================================\n             SIMD-friendly algorithms for substring searching\n================================================================================\n\nSample programs for article \"SIMD-friendly algorithms for substring searching\"\n(http://0x80.pl/articles/simd-strfind.html).\n\nThe **root directory** contains C++11 procedures implemented using intrinsics\nfor SSE, SSE4, AVX2, AVX512F, AVX512BW and ARM Neon (both ARMv7 and ARMv8).\n\nThe subdirectory **original** contains 32-bit programs with inline assembly,\nwritten in 2008 for another article__.\n\n__ http://0x80.pl/articles/sse4_substring_locate.html\n\n\nUsage\n------------------------------------------------------------------------\n\nTo run unit and validation tests type ``make test_ARCH``, to run\nperformance tests type ``make run_ARCH``. Value ``ARCH`` selects\nthe CPU architecture:\n\n* sse4,\n* avx2,\n* avx512f,\n* avx512bw,\n* arm,\n* aarch64.\n\n\nPerformance results\n------------------------------------------------------------------------\n\nThe subdirectory ``results`` contains raw timings from various computers.\n"
  },
  {
    "path": "aarch64-strstr-v2.cpp",
    "content": "size_t FORCE_INLINE aarch64_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) {\n\n    assert(k > 0);\n    assert(n > 0);\n\n    const uint8x16_t first = vdupq_n_u8(needle[0]);\n    const uint8x16_t last  = vdupq_n_u8(needle[k - 1]);\n\n    const uint8_t* ptr = reinterpret_cast<const uint8_t*>(s);\n\n    for (size_t i = 0; i < n; i += 16) {\n\n        const uint8x16_t block_first = vld1q_u8(ptr + i);\n        const uint8x16_t block_last  = vld1q_u8(ptr + i + k - 1);\n\n        const uint8x16_t eq_first = vceqq_u8(first, block_first);\n        const uint8x16_t eq_last  = vceqq_u8(last, block_last);\n        const uint8x16_t pred_16  = vandq_u8(eq_first, eq_last);\n\n        uint64_t mask;\n\n        mask = vgetq_lane_u64(vreinterpretq_u64_u8(pred_16), 0);\n        if (mask) {\n            for (int j=0; j < 8; j++) {\n                if ((mask & 0xff) && (memcmp(s + i + j + 1, needle + 1, k - 2) == 0)) {\n                    return i + j;\n                }\n\n                mask >>= 8;\n            }\n        }\n\n        mask = vgetq_lane_u64(vreinterpretq_u64_u8(pred_16), 1);\n        if (mask) {\n            for (int j=0; j < 8; j++) {\n                if ((mask & 0xff) && (memcmp(s + i + j + 8 + 1, needle + 1, k - 2) == 0)) {\n                    return i + j + 8;\n                }\n\n                mask >>= 8;\n            }\n        }\n    }\n\n    return std::string::npos;\n}\n\n// ------------------------------------------------------------------------\n\ntemplate <size_t k, typename MEMCMP>\nsize_t FORCE_INLINE aarch64_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) {\n\n    assert(k > 0);\n    assert(n > 0);\n\n    const uint8x16_t first = vdupq_n_u8(needle[0]);\n    const uint8x16_t last  = vdupq_n_u8(needle[k - 1]);\n\n    const uint8_t* ptr = reinterpret_cast<const uint8_t*>(s);\n\n    for (size_t i = 0; i < n; i += 16) {\n\n        const uint8x16_t block_first = vld1q_u8(ptr + i);\n   
     const uint8x16_t block_last  = vld1q_u8(ptr + i + k - 1);\n\n        const uint8x16_t eq_first = vceqq_u8(first, block_first);\n        const uint8x16_t eq_last  = vceqq_u8(last, block_last);\n        const uint8x16_t pred_16  = vandq_u8(eq_first, eq_last);\n\n        uint64_t mask;\n        int j;\n\n        mask = vgetq_lane_u64(vreinterpretq_u64_u8(pred_16), 0);\n        j = 0;\n        while (mask) {\n            if ((mask & 0xff) && (memcmp_fun(s + i + j + 1, needle + 1))) {\n                return i + j;\n            }\n\n            mask >>= 8;\n            j += 1;\n        }\n\n        mask = vgetq_lane_u64(vreinterpretq_u64_u8(pred_16), 1);\n        j = 0;\n        while (mask) {\n            if ((mask & 0xff) && (memcmp_fun(s + i + j + 8 + 1, needle + 1))) {\n                return i + j + 8;\n            }\n\n            mask >>= 8;\n            j += 1;\n        }\n    }\n\n    return std::string::npos;\n}\n\n// ------------------------------------------------------------------------\n\nsize_t aarch64_strstr_v2(const char* s, size_t n, const char* needle, size_t k) {\n\n    size_t result = std::string::npos;\n\n    if (n < k) {\n        return result;\n    }\n\n\tswitch (k) {\n\t\tcase 0:\n\t\t\treturn 0;\n\n\t\tcase 1: {\n            const char* res = reinterpret_cast<const char*>(strchr(s, needle[0]));\n\n\t\t\treturn (res != nullptr) ? 
res - s : std::string::npos;\n            }\n\n        case 2:\n            result = aarch64_strstr_memcmp<2>(s, n, needle, always_true);\n            break;\n\n        case 3:\n            result = aarch64_strstr_memcmp<3>(s, n, needle, memcmp1);\n            break;\n\n        case 4:\n            result = aarch64_strstr_memcmp<4>(s, n, needle, memcmp2);\n            break;\n\n        case 5:\n            result = aarch64_strstr_memcmp<5>(s, n, needle, memcmp4);\n            break;\n\n        case 6:\n            result = aarch64_strstr_memcmp<6>(s, n, needle, memcmp4);\n            break;\n\n        case 7:\n            result = aarch64_strstr_memcmp<7>(s, n, needle, memcmp5);\n            break;\n\n        case 8:\n            result = aarch64_strstr_memcmp<8>(s, n, needle, memcmp6);\n            break;\n\n        case 9:\n            result = aarch64_strstr_memcmp<9>(s, n, needle, memcmp8);\n            break;\n\n        case 10:\n            result = aarch64_strstr_memcmp<10>(s, n, needle, memcmp8);\n            break;\n\n        case 11:\n            result = aarch64_strstr_memcmp<11>(s, n, needle, memcmp9);\n            break;\n\n        case 12:\n            result = aarch64_strstr_memcmp<12>(s, n, needle, memcmp10);\n            break;\n\n\t\tdefault:\n\t\t\tresult = aarch64_strstr_anysize(s, n, needle, k);\n            break;\n    }\n\n    if (result <= n - k) {\n        return result;\n    } else {\n        return std::string::npos;\n    }\n}\n\n// ------------------------------------------------------------------------\n\nsize_t aarch64_strstr_v2(const std::string& s, const std::string& needle) {\n\n    return aarch64_strstr_v2(s.data(), s.size(), needle.data(), needle.size());\n}\n\n\n"
  },
  {
    "path": "avx2-naive-strstr.cpp",
    "content": "// Method described in https://arxiv.org/pdf/1612.01506.pdf\n//\n// Implementation by Daniel Lemire\n// https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/blob/master/extra/simd/substring/substring.c\n\nsize_t FORCE_INLINE avx2_naive_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) {\n\n    assert(k > 0);\n    assert(n > 0);\n\n    if (n == k) {\n        return (memcmp(s, needle, k) == 0) ? 0 : std::string::npos;\n    }\n\n    for (size_t i = 0; i < n - k + 1; i += 32) {\n        uint32_t found = 0xffffffff;\n        for (size_t j = 0; (j < k) && (found != 0) ; ++j) {\n            const __m256i textvector = _mm256_loadu_si256((const __m256i *)(s + i + j));\n            const __m256i needlevector = _mm256_set1_epi8(needle[j]);\n            uint32_t bitmask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(textvector, needlevector));\n            found = found & bitmask;\n        }\n        if (found != 0) {\n            return i + __builtin_ctz(found);\n        }\n    }\n\n    return std::string::npos;\n}\n\n\n// ------------------------------------------------------------------------\n\nsize_t avx2_naive_strstr(const char* s, size_t n, const char* needle, size_t k) {\n\n    size_t result = std::string::npos;\n\n    if (n < k) {\n        return result;\n    }\n\n\tresult = avx2_naive_strstr_anysize(s, n, needle, k);\n\n    if (result <= n - k) {\n        return result;\n    } else {\n        return std::string::npos;\n    }\n}\n\n// ------------------------------------------------------------------------\n\nsize_t avx2_naive_strstr(const std::string& s, const std::string& needle) {\n\n    return avx2_naive_strstr(s.data(), s.size(), needle.data(), needle.size());\n}\n\n\n"
  },
  {
    "path": "avx2-naive-strstr64.cpp",
    "content": "// Method descibed in https://arxiv.org/pdf/1612.01506.pdf\n//\n// Implementation by Daniel Lemire\n// https://github.com/WojciechMula/sse4-strstr/issues/2\n\nsize_t FORCE_INLINE avx2_naive_strstr_anysize64(const char* s, size_t n, const char* needle, size_t k) {\n\n    assert(k > 0);\n    assert(n > 0);\n    const __m256i first = _mm256_set1_epi8(needle[0]);\n    const __m256i last  = _mm256_set1_epi8(needle[k - 1]);\n    for (size_t i = 0; i < n; i += 64) {\n\n        const __m256i block_first1 = _mm256_loadu_si256((const __m256i*)(s + i));\n        const __m256i block_last1  = _mm256_loadu_si256((const __m256i*)(s + i + k - 1));\n\n        const __m256i block_first2 = _mm256_loadu_si256((const __m256i*)(s + i + 32));\n        const __m256i block_last2  = _mm256_loadu_si256((const __m256i*)(s + i + k - 1 + 32));\n\n        const __m256i eq_first1 = _mm256_cmpeq_epi8(first, block_first1);\n        const __m256i eq_last1  = _mm256_cmpeq_epi8(last, block_last1);\n\n        const __m256i eq_first2 = _mm256_cmpeq_epi8(first, block_first2);\n        const __m256i eq_last2  = _mm256_cmpeq_epi8(last, block_last2);\n\n        const uint32_t mask1 = _mm256_movemask_epi8(_mm256_and_si256(eq_first1, eq_last1));\n        const uint32_t mask2 = _mm256_movemask_epi8(_mm256_and_si256(eq_first2, eq_last2));\n        uint64_t mask = mask1 | ((uint64_t)mask2 << 32);\n\n        while (mask != 0) {\n            const int bitpos = __builtin_ctzll(mask);\n            if (memcmp(s + i + bitpos + 1, needle + 1, k - 2) == 0) {\n                return i + bitpos;\n            }\n            mask = bits::clear_leftmost_set(mask);\n        }\n    }\n\n    return std::string::npos;\n}\n\n\n// ------------------------------------------------------------------------\n\nsize_t avx2_naive_strstr64(const char* s, size_t n, const char* needle, size_t k) {\n\n    size_t result = std::string::npos;\n\n    if (n < k) {\n        return result;\n    }\n\n\tresult = 
avx2_naive_strstr_anysize64(s, n, needle, k);\n\n    if (result <= n - k) {\n        return result;\n    } else {\n        return std::string::npos;\n    }\n}\n\n// ------------------------------------------------------------------------\n\nsize_t avx2_naive_strstr64(const std::string& s, const std::string& needle) {\n\n    return avx2_naive_strstr64(s.data(), s.size(), needle.data(), needle.size());\n}\n\n\n"
  },
  {
    "path": "avx2-naive-unrolled-strstr.cpp",
    "content": "// Method described in https://arxiv.org/pdf/1612.01506.pdf\n//\n// Implementation by Daniel Lemire\n\nsize_t FORCE_INLINE avx2_naive_strstr_unrolled_anysize(const char* s, size_t n, const char* needle, size_t k) {\n\n    // assert(n % 32 == 0); // deliberately commented out\n    // todo: fix it so we can handle variable-length inputs and\n    // can catch matches at the end of the data.\n    for (size_t i = 0; i < n - k; i += 32) {\n      uint32_t found = 0xFFFFFFFF; // 32 1-bits\n      size_t j = 0;\n      for (; (j + 3 < k) && (found != 0)  ; j += 4) {\n        __m256i textvector1 = _mm256_loadu_si256((const __m256i *)(s + i + j));\n        __m256i needlevector1 = _mm256_set1_epi8(needle[j]);\n        __m256i textvector2 = _mm256_loadu_si256((const __m256i *)(s + i + j + 1));\n        __m256i needlevector2 = _mm256_set1_epi8(needle[j + 1]);\n        __m256i cmp1 = _mm256_cmpeq_epi8(textvector1, needlevector1);\n        __m256i cmp2 = _mm256_cmpeq_epi8(textvector2, needlevector2);\n        __m256i textvector3 = _mm256_loadu_si256((const __m256i *)(s + i + j + 2));\n        __m256i needlevector3 = _mm256_set1_epi8(needle[j + 2]);\n        __m256i textvector4 = _mm256_loadu_si256((const __m256i *)(s + i + j + 3));\n        __m256i needlevector4 = _mm256_set1_epi8(needle[j + 3]);\n        __m256i cmp3 = _mm256_cmpeq_epi8(textvector3, needlevector3);\n        __m256i cmp4 = _mm256_cmpeq_epi8(textvector4, needlevector4);\n        __m256i cmp12 = _mm256_and_si256(cmp1,cmp2);\n        __m256i cmp34 = _mm256_and_si256(cmp3,cmp4);\n        uint32_t bitmask = _mm256_movemask_epi8(_mm256_and_si256(cmp12,cmp34));\n        found = found & bitmask;\n      }\n      for (; (j < k) && (found != 0) ; ++j) {\n        __m256i textvector = _mm256_loadu_si256((const __m256i *)(s + i + j));\n        __m256i needlevector = _mm256_set1_epi8(needle[j]);\n        uint32_t bitmask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(textvector, needlevector));\n        found = found & 
bitmask;\n      }\n      if(found != 0) {\n        // got a match... maybe\n        return i + __builtin_ctz(found);\n      }\n    }\n\n    return std::string::npos;\n}\n\n\n// ------------------------------------------------------------------------\n\nsize_t avx2_naive_unrolled_strstr(const char* s, size_t n, const char* needle, size_t k) {\n\n    size_t result = std::string::npos;\n\n    if (n < k) {\n        return result;\n    }\n\n\tresult = avx2_naive_strstr_unrolled_anysize(s, n, needle, k);\n\n    if (result <= n - k) {\n        return result;\n    } else {\n        return std::string::npos;\n    }\n}\n\n// ------------------------------------------------------------------------\n\nsize_t avx2_naive_unrolled_strstr(const std::string& s, const std::string& needle) {\n\n    return avx2_naive_unrolled_strstr(s.data(), s.size(), needle.data(), needle.size());\n}\n\n\n"
  },
  {
    "path": "avx2-strstr-v2-clang-specific.cpp",
    "content": "/*\n    The following templates implement the loop, where K is a template parameter.\n\n        for (unsigned i=1; i < K; i++) {\n            const __m256i substring = _mm256_alignr_epi8(next1, curr, i);\n            eq = _mm256_and_si256(eq, _mm256_cmpeq_epi8(substring, broadcasted[i]));\n        }\n\n    Clang complains that the loop parameter `i` is a variable and it cannot be\n    applied as a parameter _mm256_alignr_epi8.  GCC somehow deals with it.\n*/\n\n#ifdef __clang__\n\ntemplate <size_t K, int i, bool terminate>\nstruct inner_loop_aux;\n\ntemplate <size_t K, int i>\nstruct inner_loop_aux<K, i, false> {\n    void operator()(__m256i& eq, const __m256i& next1, const __m256i& curr, const __m256i (&broadcasted)[K]) {\n        const __m256i substring = _mm256_alignr_epi8(next1, curr, i);\n        eq = _mm256_and_si256(eq, _mm256_cmpeq_epi8(substring, broadcasted[i]));\n        inner_loop_aux<K, i + 1, i + 1 == K>()(eq, next1, curr, broadcasted);\n    }\n};\n\ntemplate <size_t K, int i>\nstruct inner_loop_aux<K, i, true> {\n    void operator()(__m256i&, const __m256i&, const __m256i&, const __m256i (&)[K]) {\n        // nop\n    }\n};\n\ntemplate <size_t K>\nstruct inner_loop {\n    void operator()(__m256i& eq, const __m256i& next1, const __m256i& curr, const __m256i (&broadcasted)[K]) {\n        static_assert(K > 0, \"wrong value\");\n        inner_loop_aux<K, 0, false>()(eq, next1, curr, broadcasted);\n    }\n};\n\n#endif\n"
  },
  {
    "path": "avx2-strstr-v2.cpp",
    "content": "// implements scheme described in http://0x80.pl/articles/simd-friendly-karp-rabin.html\n\nsize_t FORCE_INLINE avx2_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) {\n\n    assert(k > 0);\n    assert(n > 0);\n\n    const __m256i first = _mm256_set1_epi8(needle[0]);\n    const __m256i last  = _mm256_set1_epi8(needle[k - 1]);\n\n    for (size_t i = 0; i < n; i += 32) {\n\n        const __m256i block_first = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s + i));\n        const __m256i block_last  = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s + i + k - 1));\n\n        const __m256i eq_first = _mm256_cmpeq_epi8(first, block_first);\n        const __m256i eq_last  = _mm256_cmpeq_epi8(last, block_last);\n\n        uint32_t mask = _mm256_movemask_epi8(_mm256_and_si256(eq_first, eq_last));\n\n        while (mask != 0) {\n\n            const auto bitpos = bits::get_first_bit_set(mask);\n\n            if (memcmp(s + i + bitpos + 1, needle + 1, k - 2) == 0) {\n                return i + bitpos;\n            }\n\n            mask = bits::clear_leftmost_set(mask);\n        }\n    }\n\n    return std::string::npos;\n}\n\n#include \"avx2-strstr-v2-clang-specific.cpp\"\n\ntemplate <size_t K>\nsize_t FORCE_INLINE avx2_strstr_eq(const char* s, size_t n, const char* needle) {\n\n    static_assert(K > 0 && K < 16, \"K must be in range [1..15]\");\n    assert(n > 0);\n\n    __m256i broadcasted[K];\n    for (unsigned i=0; i < K; i++) {\n        broadcasted[i] = _mm256_set1_epi8(needle[i]);\n    }\n\n    __m256i curr = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s));\n\n    for (size_t i = 0; i < n; i += 32) {\n\n        const __m256i next = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s + i + 32));\n\n        __m256i eq = _mm256_cmpeq_epi8(curr, broadcasted[0]);\n\n        // AVX2 palignr works on 128-bit lanes, thus some extra work is needed\n        //\n        // curr = [a, b] (2 x 128 bit)\n        // next = [c, 
d]\n        // substring = [palignr(b, a, i), palignr(c, b, i)]\n        __m256i next1;\n        next1 = _mm256_inserti128_si256(next1, _mm256_extracti128_si256(curr, 1), 0); // b\n        next1 = _mm256_inserti128_si256(next1, _mm256_extracti128_si256(next, 0), 1); // c\n\n#ifndef __clang__\n        for (unsigned i=1; i < K; i++) {\n            const __m256i substring = _mm256_alignr_epi8(next1, curr, i);\n            eq = _mm256_and_si256(eq, _mm256_cmpeq_epi8(substring, broadcasted[i]));\n        }\n#else\n        inner_loop<K>()(eq, next1, curr, broadcasted);\n#endif\n\n        curr = next;\n\n        const uint32_t mask = _mm256_movemask_epi8(eq);\n        if (mask != 0) {\n            return i + bits::get_first_bit_set(mask);\n        }\n    }\n\n    return std::string::npos;\n}\n\ntemplate <size_t k, typename MEMCMP>\nsize_t FORCE_INLINE avx2_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) {\n\n    assert(k > 0);\n    assert(n > 0);\n\n    const __m256i first = _mm256_set1_epi8(needle[0]);\n    const __m256i last  = _mm256_set1_epi8(needle[k - 1]);\n\n    for (size_t i = 0; i < n; i += 32) {\n\n        const __m256i block_first = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s + i));\n        const __m256i block_last  = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s + i + k - 1));\n\n        const __m256i eq_first = _mm256_cmpeq_epi8(first, block_first);\n        const __m256i eq_last  = _mm256_cmpeq_epi8(last, block_last);\n\n        uint32_t mask = _mm256_movemask_epi8(_mm256_and_si256(eq_first, eq_last));\n\n        while (mask != 0) {\n\n            const auto bitpos = bits::get_first_bit_set(mask);\n\n            if (memcmp_fun(s + i + bitpos + 1, needle + 1)) {\n                return i + bitpos;\n            }\n\n            mask = bits::clear_leftmost_set(mask);\n        }\n    }\n\n    return std::string::npos;\n}\n\n// ------------------------------------------------------------------------\n\nsize_t 
avx2_strstr_v2(const char* s, size_t n, const char* needle, size_t k) {\n\n    size_t result = std::string::npos;\n\n    if (n < k) {\n        return result;\n    }\n\n\tswitch (k) {\n\t\tcase 0:\n\t\t\treturn 0;\n\n\t\tcase 1: {\n            const char* res = reinterpret_cast<const char*>(strchr(s, needle[0]));\n\n\t\t\treturn (res != nullptr) ? res - s : std::string::npos;\n            }\n\n        case 2:\n            result = avx2_strstr_eq<2>(s, n, needle);\n            break;\n\n        case 3:\n            result = avx2_strstr_memcmp<3>(s, n, needle, memcmp1);\n            break;\n\n        case 4:\n            result = avx2_strstr_memcmp<4>(s, n, needle, memcmp2);\n            break;\n\n        case 5:\n            // Note: use memcmp4 rather memcmp3, as the last character\n            //       of needle is already proven to be equal\n            result = avx2_strstr_memcmp<5>(s, n, needle, memcmp4);\n            break;\n\n        case 6:\n            result = avx2_strstr_memcmp<6>(s, n, needle, memcmp4);\n            break;\n\n        case 7:\n            result = avx2_strstr_memcmp<7>(s, n, needle, memcmp5);\n            break;\n\n        case 8:\n            result = avx2_strstr_memcmp<8>(s, n, needle, memcmp6);\n            break;\n\n        case 9:\n            // Note: use memcmp8 rather memcmp7 for the same reason as above.\n            result = avx2_strstr_memcmp<9>(s, n, needle, memcmp8);\n            break;\n\n        case 10:\n            result = avx2_strstr_memcmp<10>(s, n, needle, memcmp8);\n            break;\n\n        case 11:\n            result = avx2_strstr_memcmp<11>(s, n, needle, memcmp9);\n            break;\n\n        case 12:\n            result = avx2_strstr_memcmp<12>(s, n, needle, memcmp10);\n            break;\n\n\t\tdefault:\n\t\t\tresult = avx2_strstr_anysize(s, n, needle, k);\n            break;\n    }\n\n    if (result <= n - k) {\n        return result;\n    } else {\n        return std::string::npos;\n    }\n}\n\n// 
------------------------------------------------------------------------\n\nsize_t avx2_strstr_v2(const std::string& s, const std::string& needle) {\n\n    return avx2_strstr_v2(s.data(), s.size(), needle.data(), needle.size());\n}\n\n\n"
  },
  {
    "path": "avx2-strstr.cpp",
    "content": "size_t avx2_strstr_long(const char* s, size_t n, const char* neddle, size_t neddle_size) {\n    \n    assert(neddle_size > 4);\n    assert(n > 0);\n\n    const uint32_t prefix32 = *reinterpret_cast<const uint32_t*>(neddle);\n    const __m256i prefix = _mm256_set1_epi32(prefix32);\n    const __m256i zeros  = _mm256_setzero_si256();\n\n    const __m256i permute = _mm256_setr_epi32(\n        0, 1, 2, 0,\n        2, 3, 4, 0\n    );\n\n    for (size_t i = 0; i < n; i += 16) {\n        \n        const __m256i in     = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s + i));\n        /*\n            [00|01|02|03|04|05|06|07|08|09|10|11|12|13|14|15|16|17|18|19|20|21|22|23|24|25|26|27|28|29|30|31]\n                                                       lane | boundary\n            [00|01|02|03|04|05|06|07|08|09|10|11|??|??|??|??|08|09|10|11|12|13|14|15|16|17|18|19|??|??|??|??]\n             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n        */\n        const __m256i data   = _mm256_permutevar8x32_epi32(in, permute);\n        const __m256i result = _mm256_mpsadbw_epu8(data, prefix, 0);\n\n        const __m256i cmp    = _mm256_cmpeq_epi16(result, zeros);\n\n        uint32_t mask = _mm256_movemask_epi8(cmp) & 0x55555555u;\n\n        while (mask != 0) {\n\n            const auto bitpos   = bits::get_first_bit_set(mask)/2;\n\n            if (memcmp(s + i + bitpos + 4, neddle + 4, neddle_size - 4) == 0) {\n                return i + bitpos;\n            }\n\n            mask = bits::clear_leftmost_set(mask);\n        }\n    }\n\n    return std::string::npos;\n}\n\n// ------------------------------------------------------------------------\n\nsize_t avx2_strstr_len4(const char* s, size_t n, const char* neddle) {\n\n    assert(n > 0);\n\n    const uint32_t prefix32 = *reinterpret_cast<const uint32_t*>(neddle);\n    const __m256i prefix = _mm256_set1_epi32(prefix32);\n    const __m256i zeros  = _mm256_setzero_si256();\n\n 
   const __m256i permute = _mm256_setr_epi32(\n        0, 1, 2, 0,\n        2, 3, 4, 0\n    );\n\n    for (size_t i = 0; i < n; i += 16) {\n        \n        const __m256i in     = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(s + i));\n        const __m256i data   = _mm256_permutevar8x32_epi32(in, permute);\n        const __m256i result = _mm256_mpsadbw_epu8(data, prefix, 0);\n\n        const __m256i cmp    = _mm256_cmpeq_epi16(result, zeros);\n\n        const uint32_t mask = _mm256_movemask_epi8(cmp) & 0x55555555u;\n\n        if (mask != 0) {\n            return i + bits::get_first_bit_set(mask)/2;\n        }\n    }\n\n    return std::string::npos;\n}\n\n// ------------------------------------------------------------------------\n\nsize_t avx2_strstr(const char* s, size_t n, const char* neddle, size_t neddle_size) {\n\n    size_t result = std::string::npos;\n\n    if (n < neddle_size) {\n        return result;\n    }\n\n\tswitch (neddle_size) {\n\t\tcase 0:\n\t\t\treturn 0;\n\n\t\tcase 1: {\n            const char* res = reinterpret_cast<const char*>(strchr(s, neddle[0]));\n\n\t\t\treturn (res != nullptr) ? res - s : std::string::npos;\n            }\n\t\tcase 2:\n\t\tcase 3:\n            {\n\t\t\tconst char* res = reinterpret_cast<const char*>(strstr(s, neddle));\n\n\t\t\treturn (res != nullptr) ? res - s : std::string::npos;\n            }\n\n\t\tcase 4:\n\t\t\tresult = avx2_strstr_len4(s, n, neddle);\n            break;\n\n\t\tdefault:\n\t\t\tresult = avx2_strstr_long(s, n, neddle, neddle_size);\n            break;\n    }\n\n\n    if (result <= n - neddle_size) {\n        return result;\n    } else {\n        return std::string::npos;\n    }\n}\n\n// --------------------------------------------------\n\nsize_t avx2_strstr(const std::string& s, const std::string& neddle) {\n\n    return avx2_strstr(s.data(), s.size(), neddle.data(), neddle.size());\n}\n\n"
  },
  {
    "path": "avx512bw-strstr-v2.cpp",
    "content": "// implements scheme described in http://0x80.pl/articles/simd-friendly-karp-rabin.html\n\nsize_t avx512bw_strstr_v2_anysize(const char* string, size_t n, const char* needle, size_t k) {\n\n    assert(n > 0);\n    assert(k > 0);\n\n    const __m512i first = _mm512_set1_epi8(needle[0]);\n    const __m512i last  = _mm512_set1_epi8(needle[k - 1]);\n\n    char* haystack = const_cast<char*>(string);\n    char* end      = haystack + n;\n\n    for (/**/; haystack < end; haystack += 64) {\n\n        const __m512i block_first = _mm512_loadu_si512(haystack + 0);\n        const __m512i block_last  = _mm512_loadu_si512(haystack + k - 1);\n\n        uint64_t mask = _mm512_cmpeq_epi8_mask(block_first, first)\n                      & _mm512_cmpeq_epi8_mask(block_last, last);\n\n        while (mask != 0) {\n\n            const uint64_t bitpos = bits::get_first_bit_set(mask);\n            const char* s = reinterpret_cast<const char*>(haystack);\n\n            if (memcmp(s + bitpos + 1, needle + 1, k - 2) == 0) {\n                return (s - string) + bitpos;\n            }\n\n            mask = bits::clear_leftmost_set(mask);\n        }\n    }\n\n    return size_t(-1);\n}\n\n\ntemplate <size_t k, typename MEMCMP>\nsize_t avx512bw_strstr_v2_memcmp(const char* string, size_t n, const char* needle, MEMCMP memeq_fun) {\n\n    assert(n > 0);\n    assert(k > 0);\n\n    const __m512i first = _mm512_set1_epi8(needle[0]);\n    const __m512i last  = _mm512_set1_epi8(needle[k - 1]);\n\n    char* haystack = const_cast<char*>(string);\n    char* end      = haystack + n;\n\n    for (/**/; haystack < end; haystack += 64) {\n\n        const __m512i block_first = _mm512_loadu_si512(haystack + 0);\n        const __m512i block_last  = _mm512_loadu_si512(haystack + k - 1);\n\n        uint64_t mask = _mm512_cmpeq_epi8_mask(block_first, first)\n                      & _mm512_cmpeq_epi8_mask(block_last, last);\n\n        while (mask != 0) {\n\n            const uint64_t bitpos = 
bits::get_first_bit_set(mask);\n            const char* s = reinterpret_cast<const char*>(haystack);\n\n            if (memeq_fun(s + bitpos + 1, needle + 1)) {\n                return (s - string) + bitpos;\n            }\n\n            mask = bits::clear_leftmost_set(mask);\n        }\n    }\n\n    return size_t(-1);\n}\n\n// ------------------------------------------------------------------------\n\nsize_t avx512bw_strstr_v2(const char* s, size_t n, const char* needle, size_t k) {\n\n    size_t result = std::string::npos;\n\n    if (n < k) {\n        return result;\n    }\n\n\tswitch (k) {\n\t\tcase 0:\n\t\t\treturn 0;\n\n\t\tcase 1: {\n            const char* res = reinterpret_cast<const char*>(strchr(s, needle[0]));\n\n\t\t\treturn (res != nullptr) ? res - s : std::string::npos;\n            }\n\n        case 2:\n            result = avx512bw_strstr_v2_memcmp<2>(s, n, needle, always_true);\n            break;\n\n        case 3:\n            result = avx512bw_strstr_v2_memcmp<3>(s, n, needle, memcmp1);\n            break;\n\n        case 4:\n            result = avx512bw_strstr_v2_memcmp<4>(s, n, needle, memcmp2);\n            break;\n\n        case 5:\n            result = avx512bw_strstr_v2_memcmp<5>(s, n, needle, memcmp3);\n            break;\n\n        case 6:\n            result = avx512bw_strstr_v2_memcmp<6>(s, n, needle, memcmp4);\n            break;\n\n        case 7:\n            result = avx512bw_strstr_v2_memcmp<7>(s, n, needle, memcmp5);\n            break;\n\n        case 8:\n            result = avx512bw_strstr_v2_memcmp<8>(s, n, needle, memcmp6);\n            break;\n\n        case 9:\n            result = avx512bw_strstr_v2_memcmp<9>(s, n, needle, memcmp7);\n            break;\n\n        case 10:\n            result = avx512bw_strstr_v2_memcmp<10>(s, n, needle, memcmp8);\n            break;\n\n        case 11:\n            result = avx512bw_strstr_v2_memcmp<11>(s, n, needle, memcmp9);\n            break;\n\n        case 12:\n            result = 
avx512bw_strstr_v2_memcmp<12>(s, n, needle, memcmp10);\n            break;\n\n\t\tdefault:\n\t\t\tresult = avx512bw_strstr_v2_anysize(s, n, needle, k);\n            break;\n    }\n\n    if (result <= n - k) {\n        return result;\n    } else {\n        return std::string::npos;\n    }\n}\n\n// --------------------------------------------------\n\nsize_t avx512bw_strstr_v2(const std::string& s, const std::string& needle) {\n\n    return avx512bw_strstr_v2(s.data(), s.size(), needle.data(), needle.size());\n}\n\n"
  },
  {
    "path": "avx512bw-strstr-v3.cpp",
    "content": "// implements scheme described in http://0x80.pl/articles/simd-friendly-karp-rabin.html\n\nsize_t avx512bw_strstr_v3_anysize(const char* string, size_t n, const char* needle, size_t k) {\n\n    assert(n > 0);\n    assert(k > 0);\n\n    const __m512i first = _mm512_set1_epi8(needle[0]);\n    const __m512i last  = _mm512_set1_epi8(needle[k - 1]);\n\n    char* haystack = const_cast<char*>(string);\n    char* end      = haystack + n;\n\n    for (/**/; haystack < end; haystack += 64) {\n\n        const __m512i   block_first = _mm512_loadu_si512(haystack + 0);\n        const __mmask64 first_eq    = _mm512_cmpeq_epi8_mask(block_first, first);\n\n        if (first_eq == 0)\n            continue;\n\n        const __m512i block_last  = _mm512_loadu_si512(haystack + k - 1);\n        uint64_t mask = _mm512_mask_cmpeq_epi8_mask(first_eq, block_last, last);\n\n        while (mask != 0) {\n\n            const uint64_t bitpos = bits::get_first_bit_set(mask);\n            const char* s = reinterpret_cast<const char*>(haystack);\n\n            if (memcmp(s + bitpos + 1, needle + 1, k - 2) == 0) {\n                return (s - string) + bitpos;\n            }\n\n            mask = bits::clear_leftmost_set(mask);\n        }\n    }\n\n    return size_t(-1);\n}\n\n\ntemplate <size_t k, typename MEMCMP>\nsize_t avx512bw_strstr_v3_memcmp(const char* string, size_t n, const char* needle, MEMCMP memeq_fun) {\n\n    assert(n > 0);\n    assert(k > 0);\n\n    const __m512i first = _mm512_set1_epi8(needle[0]);\n    const __m512i last  = _mm512_set1_epi8(needle[k - 1]);\n\n    char* haystack = const_cast<char*>(string);\n    char* end      = haystack + n;\n\n    for (/**/; haystack < end; haystack += 64) {\n\n        const __m512i block_first = _mm512_loadu_si512(haystack + 0);\n        const __mmask64 first_eq  = _mm512_cmpeq_epi8_mask(block_first, first);\n\n        if (first_eq == 0)\n            continue;\n\n        const __m512i block_last  = _mm512_loadu_si512(haystack + k - 
1);\n        uint64_t mask = _mm512_mask_cmpeq_epi8_mask(first_eq, block_last, last);\n\n        while (mask != 0) {\n\n            const uint64_t bitpos = bits::get_first_bit_set(mask);\n            const char* s = reinterpret_cast<const char*>(haystack);\n\n            if (memeq_fun(s + bitpos + 1, needle + 1)) {\n                return (s - string) + bitpos;\n            }\n\n            mask = bits::clear_leftmost_set(mask);\n        }\n    }\n\n    return size_t(-1);\n}\n\n// ------------------------------------------------------------------------\n\nsize_t avx512bw_strstr_v3(const char* s, size_t n, const char* needle, size_t k) {\n\n    size_t result = std::string::npos;\n\n    if (n < k) {\n        return result;\n    }\n\n\tswitch (k) {\n\t\tcase 0:\n\t\t\treturn 0;\n\n\t\tcase 1: {\n            const char* res = reinterpret_cast<const char*>(strchr(s, needle[0]));\n\n\t\t\treturn (res != nullptr) ? res - s : std::string::npos;\n            }\n\n        case 2:\n            result = avx512bw_strstr_v3_memcmp<2>(s, n, needle, always_true);\n            break;\n\n        case 3:\n            result = avx512bw_strstr_v3_memcmp<3>(s, n, needle, memcmp1);\n            break;\n\n        case 4:\n            result = avx512bw_strstr_v3_memcmp<4>(s, n, needle, memcmp2);\n            break;\n\n        case 5:\n            result = avx512bw_strstr_v3_memcmp<5>(s, n, needle, memcmp3);\n            break;\n\n        case 6:\n            result = avx512bw_strstr_v3_memcmp<6>(s, n, needle, memcmp4);\n            break;\n\n        case 7:\n            result = avx512bw_strstr_v3_memcmp<7>(s, n, needle, memcmp5);\n            break;\n\n        case 8:\n            result = avx512bw_strstr_v3_memcmp<8>(s, n, needle, memcmp6);\n            break;\n\n        case 9:\n            result = avx512bw_strstr_v3_memcmp<9>(s, n, needle, memcmp7);\n            break;\n\n        case 10:\n            result = avx512bw_strstr_v3_memcmp<10>(s, n, needle, memcmp8);\n            
break;\n\n        case 11:\n            result = avx512bw_strstr_v3_memcmp<11>(s, n, needle, memcmp9);\n            break;\n\n        case 12:\n            result = avx512bw_strstr_v3_memcmp<12>(s, n, needle, memcmp10);\n            break;\n\n\t\tdefault:\n\t\t\tresult = avx512bw_strstr_v3_anysize(s, n, needle, k);\n            break;\n    }\n\n    if (result <= n - k) {\n        return result;\n    } else {\n        return std::string::npos;\n    }\n}\n\n// --------------------------------------------------\n\nsize_t avx512bw_strstr_v3(const std::string& s, const std::string& needle) {\n\n    return avx512bw_strstr_v3(s.data(), s.size(), needle.data(), needle.size());\n}\n\n"
  },
  {
    "path": "avx512f-strstr-v2.cpp",
    "content": "// implements scheme described in http://0x80.pl/articles/simd-friendly-karp-rabin.html\n\n__mmask16 FORCE_INLINE zero_byte_mask(const __m512i v) {\n\n    const __m512i v01  = _mm512_set1_epi8(0x01);\n    const __m512i v80  = _mm512_set1_epi8(int8_t(0x80));\n\n    const __m512i v1   = _mm512_sub_epi32(v, v01);\n    // tmp1 = (v - 0x01010101) & ~v & 0x80808080\n    const __m512i tmp1 = _mm512_ternarylogic_epi32(v1, v, v80, 0x20);\n\n    return _mm512_test_epi32_mask(tmp1, tmp1);\n}\n\n\nsize_t avx512f_strstr_v2_anysize(const char* string, size_t n, const char* needle, size_t k) {\n\n    assert(n > 0);\n    assert(k > 0);\n\n    const __m512i first = _mm512_set1_epi8(needle[0]);\n    const __m512i last  = _mm512_set1_epi8(needle[k - 1]);\n\n    char* haystack = const_cast<char*>(string);\n    char* end      = haystack + n;\n\n    for (/**/; haystack < end; haystack += 64) {\n\n        const __m512i block_first = _mm512_loadu_si512(haystack + 0);\n        const __m512i block_last  = _mm512_loadu_si512(haystack + k - 1);\n\n#if 0\n        const __m512i first_zeros = _mm512_xor_si512(block_first, first);\n        const __m512i last_zeros  = _mm512_xor_si512(block_last, last);\n        const __m512i zeros       = _mm512_or_si512(first_zeros, last_zeros);\n#else\n        const __m512i first_zeros = _mm512_xor_si512(block_first, first);\n        /*\n            first_zeros | block_last | last |  first_zeros | (block_last ^ last)\n            ------------+------------+------+------------------------------------\n                 0      |      0     |   0  |      0\n                 0      |      0     |   1  |      1\n                 0      |      1     |   0  |      1\n                 0      |      1     |   1  |      0\n                 1      |      0     |   0  |      1\n                 1      |      0     |   1  |      1\n                 1      |      1     |   0  |      1\n                 1      |      1     |   1  |      1\n        */\n        
const __m512i zeros       = _mm512_ternarylogic_epi32(first_zeros, block_last, last, 0xf6);\n#endif\n\n        uint32_t mask = zero_byte_mask(zeros);\n        while (mask) {\n\n            const uint64_t p = __builtin_ctz(mask);\n\n            if (memcmp(haystack + 4*p + 0, needle, k) == 0) {\n                return (haystack - string) + 4*p + 0;\n            }\n\n            if (memcmp(haystack + 4*p + 1, needle, k) == 0) {\n                return (haystack - string) + 4*p + 1;\n            }\n\n            if (memcmp(haystack + 4*p + 2, needle, k) == 0) {\n                return (haystack - string) + 4*p + 2;\n            }\n\n            if (memcmp(haystack + 4*p + 3, needle, k) == 0) {\n                return (haystack - string) + 4*p + 3;\n            }\n\n            mask = bits::clear_leftmost_set(mask);\n        }\n    }\n\n    return size_t(-1);\n}\n\n\ntemplate <size_t k, typename MEMCMP>\nsize_t avx512f_strstr_v2_memcmp(const char* string, size_t n, const char* needle, MEMCMP memeq_fun) {\n\n    assert(n > 0);\n    assert(k > 0);\n\n    const __m512i first = _mm512_set1_epi8(needle[0]);\n    const __m512i last  = _mm512_set1_epi8(needle[k - 1]);\n\n    char* haystack = const_cast<char*>(string);\n    char* end      = haystack + n;\n\n    for (/**/; haystack < end; haystack += 64) {\n\n        const __m512i block_first = _mm512_loadu_si512(haystack + 0);\n        const __m512i block_last  = _mm512_loadu_si512(haystack + k - 1);\n\n        const __m512i first_zeros = _mm512_xor_si512(block_first, first);\n        const __m512i zeros       = _mm512_ternarylogic_epi32(first_zeros, block_last, last, 0xf6);\n\n        uint32_t mask = zero_byte_mask(zeros);\n        while (mask) {\n\n            const uint64_t p = __builtin_ctz(mask);\n\n            if (memeq_fun(haystack + 4*p + 0, needle)) {\n                return (haystack - string) + 4*p + 0;\n            }\n\n            if (memeq_fun(haystack + 4*p + 1, needle)) {\n                return (haystack - 
string) + 4*p + 1;\n            }\n\n            if (memeq_fun(haystack + 4*p + 2, needle)) {\n                return (haystack - string) + 4*p + 2;\n            }\n\n            if (memeq_fun(haystack + 4*p + 3, needle)) {\n                return (haystack - string) + 4*p + 3;\n            }\n\n            mask = bits::clear_leftmost_set(mask);\n        }\n    }\n\n    return size_t(-1);\n}\n\n// ------------------------------------------------------------------------\n\nsize_t avx512f_strstr_v2(const char* s, size_t n, const char* needle, size_t k) {\n\n    size_t result = std::string::npos;\n\n    if (n < k) {\n        return result;\n    }\n\n\tswitch (k) {\n\t\tcase 0:\n\t\t\treturn 0;\n\n\t\tcase 1: {\n            const char* res = reinterpret_cast<const char*>(strchr(s, needle[0]));\n\n\t\t\treturn (res != nullptr) ? res - s : std::string::npos;\n            }\n\n        case 2:\n            result = avx512f_strstr_v2_memcmp<2>(s, n, needle, memcmp2);\n            break;\n\n        case 3:\n            result = avx512f_strstr_v2_memcmp<3>(s, n, needle, memcmp3);\n            break;\n\n        case 4:\n            result = avx512f_strstr_v2_memcmp<4>(s, n, needle, memcmp4);\n            break;\n\n        case 5:\n            result = avx512f_strstr_v2_memcmp<5>(s, n, needle, memcmp5);\n            break;\n\n        case 6:\n            result = avx512f_strstr_v2_memcmp<6>(s, n, needle, memcmp6);\n            break;\n\n        case 7:\n            result = avx512f_strstr_v2_memcmp<7>(s, n, needle, memcmp7);\n            break;\n\n        case 8:\n            result = avx512f_strstr_v2_memcmp<8>(s, n, needle, memcmp8);\n            break;\n\n        case 9:\n            result = avx512f_strstr_v2_memcmp<9>(s, n, needle, memcmp9);\n            break;\n\n        case 10:\n            result = avx512f_strstr_v2_memcmp<10>(s, n, needle, memcmp10);\n            break;\n\n        case 11:\n            result = avx512f_strstr_v2_memcmp<11>(s, n, needle, memcmp11);\n   
         break;\n\n        case 12:\n            result = avx512f_strstr_v2_memcmp<12>(s, n, needle, memcmp12);\n            break;\n\n\t\tdefault:\n\t\t\tresult = avx512f_strstr_v2_anysize(s, n, needle, k);\n            break;\n    }\n\n    if (result <= n - k) {\n        return result;\n    } else {\n        return std::string::npos;\n    }\n}\n\n// --------------------------------------------------\n\nsize_t avx512f_strstr_v2(const std::string& s, const std::string& needle) {\n\n    return avx512f_strstr_v2(s.data(), s.size(), needle.data(), needle.size());\n}\n\n"
  },
  {
    "path": "avx512f-strstr.cpp",
    "content": "/*\n    string - pointer to the string\n    n      - string length in bytes\n    needle - pointer to another string\n    n      - needle length in bytes\n*/\nsize_t avx512f_strstr_long(const char* string, size_t n, const char* needle, size_t k) {\n\n    assert(n > 0);\n    assert(k > 4);\n\n    __m512i curr;\n    __m512i next;\n    __m512i v0, v1, v2, v3;\n\n    char* haystack = const_cast<char*>(string);\n    char* last     = haystack + n;\n\n    const uint32_t prf   = *(uint32_t*)needle; // the first 4 bytes of needle\n    const __m512i prefix = _mm512_set1_epi32(prf);\n\n    next = _mm512_loadu_si512(haystack);\n\n    for (/**/; haystack < last; haystack += 64) {\n\n        curr = next;\n        next = _mm512_loadu_si512(haystack + 64);\n        const __m512i shft = _mm512_alignr_epi32(next, curr, 1);\n\n        v0 = curr;\n\n        {\n            const __m512i t1 = _mm512_srli_epi32(curr, 8);\n            const __m512i t2 = _mm512_slli_epi32(shft, 24);\n            v1 = _mm512_or_si512(t1, t2);\n        }\n        {\n            const __m512i t1 = _mm512_srli_epi32(curr, 16);\n            const __m512i t2 = _mm512_slli_epi32(shft, 16);\n            v2 = _mm512_or_si512(t1, t2);\n        }\n        {\n            const __m512i t1 = _mm512_srli_epi32(curr, 24);\n            const __m512i t2 = _mm512_slli_epi32(shft, 8);\n            v3 = _mm512_or_si512(t1, t2);\n        }\n\n        uint16_t m0 = _mm512_cmpeq_epi32_mask(v0, prefix);\n        uint16_t m1 = _mm512_cmpeq_epi32_mask(v1, prefix);\n        uint16_t m2 = _mm512_cmpeq_epi32_mask(v2, prefix);\n        uint16_t m3 = _mm512_cmpeq_epi32_mask(v3, prefix);\n\n        int index = 64;\n        while (m0 | m1 | m2 | m3) {\n            if (m0) {\n                int pos = __builtin_ctz(m0) * 4 + 0;\n                m0 = m0 & (m0 - 1);\n\n                if (pos < index && memcmp(haystack + pos + 4, needle + 4, k - 4) == 0) {\n                    index = pos;\n                }\n            }\n\n  
          if (m1) {\n                int pos = __builtin_ctz(m1) * 4 + 1;\n                m1 = m1 & (m1 - 1);\n\n                if (pos < index && memcmp(haystack + pos + 4, needle + 4, k - 4) == 0) {\n                    index = pos;\n                }\n            }\n\n            if (m2) {\n                int pos = __builtin_ctz(m2) * 4 + 2;\n                m2 = m2 & (m2 - 1);\n\n                if (pos < index && memcmp(haystack + pos + 4, needle + 4, k - 4) == 0) {\n                    index = pos;\n                }\n            }\n\n            if (m3) {\n                int pos = __builtin_ctz(m3) * 4 + 3;\n                m3 = m3 & (m3 - 1);\n\n                if (pos < index && memcmp(haystack + pos + 4, needle + 4, k - 4) == 0) {\n                    index = pos;\n                }\n            }\n        }\n\n        if (index < 64) {\n            return (haystack - string) + index;\n        }\n    }\n\n    return size_t(-1);\n}\n\n// ------------------------------------------------------------------------\n\nsize_t avx512f_strstr_eq4(const char* string, size_t n, const char* needle) {\n\n    assert(n > 0);\n\n    __m512i curr;\n    __m512i next;\n    __m512i v0, v1, v2, v3;\n\n    char* haystack = const_cast<char*>(string);\n    char* last     = haystack + n;\n\n    const uint32_t prf   = *(uint32_t*)needle; // the first 4 bytes of needle\n    const __m512i prefix = _mm512_set1_epi32(prf);\n\n    next = _mm512_loadu_si512(haystack);\n\n    for (/**/; haystack < last; haystack += 64) {\n\n        curr = next;\n        next = _mm512_loadu_si512(haystack + 64);\n        const __m512i shft = _mm512_alignr_epi32(next, curr, 1);\n\n        v0 = curr;\n\n        {\n            const __m512i t1 = _mm512_srli_epi32(curr, 8);\n            const __m512i t2 = _mm512_slli_epi32(shft, 24);\n            v1 = _mm512_or_si512(t1, t2);\n        }\n        {\n            const __m512i t1 = _mm512_srli_epi32(curr, 16);\n            const __m512i t2 = 
_mm512_slli_epi32(shft, 16);\n            v2 = _mm512_or_si512(t1, t2);\n        }\n        {\n            const __m512i t1 = _mm512_srli_epi32(curr, 24);\n            const __m512i t2 = _mm512_slli_epi32(shft, 8);\n            v3 = _mm512_or_si512(t1, t2);\n        }\n\n        uint16_t m0 = _mm512_cmpeq_epi32_mask(v0, prefix);\n        uint16_t m1 = _mm512_cmpeq_epi32_mask(v1, prefix);\n        uint16_t m2 = _mm512_cmpeq_epi32_mask(v2, prefix);\n        uint16_t m3 = _mm512_cmpeq_epi32_mask(v3, prefix);\n\n        int index = 64;\n        if (m0) {\n            int pos = __builtin_ctz(m0) * 4 + 0;\n            if (pos < index) {\n                index = pos;\n            }\n        }\n\n        if (m1) {\n            int pos = __builtin_ctz(m1) * 4 + 1;\n            if (pos < index) {\n                index = pos;\n            }\n        }\n\n        if (m2) {\n            int pos = __builtin_ctz(m2) * 4 + 2;\n            if (pos < index) {\n                index = pos;\n            }\n        }\n\n        if (m3) {\n            int pos = __builtin_ctz(m3) * 4 + 3;\n            if (pos < index) {\n                index = pos;\n            }\n        }\n\n        if (index < 64) {\n            return (haystack - string) + index;\n        }\n\n        assert(m0 == 0 && m1 == 0 && m2 == 0 && m3 == 0);\n    }\n\n    return size_t(-1);\n}\n\n// ------------------------------------------------------------------------\n\nsize_t avx512f_strstr(const char* s, size_t n, const char* needle, size_t needle_size) {\n\n    size_t result = std::string::npos;\n\n    if (n < needle_size) {\n        return result;\n    }\n\n\tswitch (needle_size) {\n\t\tcase 0:\n\t\t\treturn 0;\n\n\t\tcase 1: {\n            const char* res = reinterpret_cast<const char*>(strchr(s, needle[0]));\n\n\t\t\treturn (res != nullptr) ? 
res - s : std::string::npos;\n            }\n\t\tcase 2:\n\t\tcase 3: {\n\t\t\tconst char* res = reinterpret_cast<const char*>(strstr(s, needle));\n\n\t\t\treturn (res != nullptr) ? res - s : std::string::npos;\n            }\n\n\t\tcase 4:\n\t\t\tresult = avx512f_strstr_eq4(s, n, needle);\n            break;\n\n\t\tdefault:\n\t\t\tresult = avx512f_strstr_long(s, n, needle, needle_size);\n            break;\n    }\n\n\n    if (result <= n - needle_size) {\n        return result;\n    } else {\n        return std::string::npos;\n    }\n}\n\n// --------------------------------------------------\n\nsize_t avx512f_strstr(const std::string& s, const std::string& needle) {\n\n    return avx512f_strstr(s.data(), s.size(), needle.data(), needle.size());\n}\n"
  },
  {
    "path": "common.h",
    "content": "#pragma once\n\n#define FORCE_INLINE inline __attribute__((always_inline))\n#define MAYBE_UNUSED inline __attribute__((unused))\n\n#if defined(HAVE_NEON_INSTRUCTIONS)\n#   include <arm_neon.h>\n#   define USE_SIMPLE_MEMCMP // for fixed-memcmp.cpp\n#else\n#   include <immintrin.h>\n#endif\n"
  },
  {
    "path": "data/placeholder",
    "content": "placeholder\n"
  },
  {
    "path": "fixed-memcmp.cpp",
    "content": "// #define USE_SIMPLE_MEMCMP // when defined simpler expressions are used\n\nnamespace {\n\n    MAYBE_UNUSED\n    bool always_true(const char*, const char*) {\n        return true;\n    }\n\n    MAYBE_UNUSED\n    bool memcmp1(const char* a, const char* b) {\n        return a[0] == b[0];\n    }\n\n    MAYBE_UNUSED\n    bool memcmp2(const char* a, const char* b) {\n        const uint16_t A = *reinterpret_cast<const uint16_t*>(a);\n        const uint16_t B = *reinterpret_cast<const uint16_t*>(b);\n        return A == B;\n    }\n\n    MAYBE_UNUSED\n    bool memcmp3(const char* a, const char* b) {\n\n#ifdef USE_SIMPLE_MEMCMP\n        return memcmp2(a, b) && memcmp1(a + 2, b + 2);\n#else\n        const uint32_t A = *reinterpret_cast<const uint32_t*>(a);\n        const uint32_t B = *reinterpret_cast<const uint32_t*>(b);\n        return (A & 0x00ffffff) == (B & 0x00ffffff);\n#endif\n    }\n\n    MAYBE_UNUSED\n    bool memcmp4(const char* a, const char* b) {\n\n        const uint32_t A = *reinterpret_cast<const uint32_t*>(a);\n        const uint32_t B = *reinterpret_cast<const uint32_t*>(b);\n        return A == B;\n    }\n\n    MAYBE_UNUSED\n    bool memcmp5(const char* a, const char* b) {\n\n#ifdef USE_SIMPLE_MEMCMP\n        return memcmp4(a, b) && memcmp1(a + 4, b + 4);\n#else\n        const uint64_t A = *reinterpret_cast<const uint64_t*>(a);\n        const uint64_t B = *reinterpret_cast<const uint64_t*>(b);\n        return ((A ^ B) & 0x000000fffffffffflu) == 0;\n#endif\n    }\n\n    MAYBE_UNUSED\n    bool memcmp6(const char* a, const char* b) {\n\n#ifdef USE_SIMPLE_MEMCMP\n        return memcmp4(a, b) && memcmp2(a + 4, b + 4);\n#else\n        const uint64_t A = *reinterpret_cast<const uint64_t*>(a);\n        const uint64_t B = *reinterpret_cast<const uint64_t*>(b);\n        return ((A ^ B) & 0x0000fffffffffffflu) == 0;\n#endif\n    }\n\n    MAYBE_UNUSED\n    bool memcmp7(const char* a, const char* b) {\n\n#ifdef USE_SIMPLE_MEMCMP \n        return 
memcmp4(a, b) && memcmp3(a + 4, b + 4);\n#else\n        const uint64_t A = *reinterpret_cast<const uint64_t*>(a);\n        const uint64_t B = *reinterpret_cast<const uint64_t*>(b);\n        return ((A ^ B) & 0x00fffffffffffffflu) == 0;\n#endif\n    }\n\n    MAYBE_UNUSED\n    bool memcmp8(const char* a, const char* b) {\n\n        const uint64_t A = *reinterpret_cast<const uint64_t*>(a);\n        const uint64_t B = *reinterpret_cast<const uint64_t*>(b);\n        return A == B;\n    }\n\n    MAYBE_UNUSED\n    bool memcmp9(const char* a, const char* b) {\n\n        const uint64_t A = *reinterpret_cast<const uint64_t*>(a);\n        const uint64_t B = *reinterpret_cast<const uint64_t*>(b);\n        return (A == B) & (a[8] == b[8]);\n    }\n\n    MAYBE_UNUSED\n    bool memcmp10(const char* a, const char* b) {\n\n        const uint64_t Aq = *reinterpret_cast<const uint64_t*>(a);\n        const uint64_t Bq = *reinterpret_cast<const uint64_t*>(b);\n        const uint16_t Aw = *reinterpret_cast<const uint16_t*>(a + 8);\n        const uint16_t Bw = *reinterpret_cast<const uint16_t*>(b + 8);\n        return (Aq == Bq) & (Aw == Bw);\n    }\n\n    MAYBE_UNUSED\n    bool memcmp11(const char* a, const char* b) {\n\n#ifdef USE_SIMPLE_MEMCMP\n        return memcmp8(a, b) && memcmp3(a + 8, b + 8);\n#else\n        const uint64_t Aq = *reinterpret_cast<const uint64_t*>(a);\n        const uint64_t Bq = *reinterpret_cast<const uint64_t*>(b);\n        const uint32_t Ad = *reinterpret_cast<const uint32_t*>(a + 8);\n        const uint32_t Bd = *reinterpret_cast<const uint32_t*>(b + 8);\n        return (Aq == Bq) & ((Ad & 0x00ffffff) == (Bd & 0x00ffffff));\n#endif\n    }\n\n    MAYBE_UNUSED\n    bool memcmp12(const char* a, const char* b) {\n\n        const uint64_t Aq = *reinterpret_cast<const uint64_t*>(a);\n        const uint64_t Bq = *reinterpret_cast<const uint64_t*>(b);\n        const uint32_t Ad = *reinterpret_cast<const uint32_t*>(a + 8);\n        const uint32_t Bd = 
*reinterpret_cast<const uint32_t*>(b + 8);\n        return (Aq == Bq) & (Ad == Bd);\n    }\n\n}\n\n"
  },
  {
    "path": "make_words.sh",
    "content": "# split words\ncat $1 \\\n  | tr -s -c \"a-zA-Z\" \"\\n\" \\\n  | sort -u \\\n  > $2\n"
  },
  {
    "path": "neon-strstr-v2.cpp",
    "content": "size_t FORCE_INLINE neon_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) {\n\n    assert(k > 0);\n    assert(n > 0);\n\n    const uint8x16_t first = vdupq_n_u8(needle[0]);\n    const uint8x16_t last  = vdupq_n_u8(needle[k - 1]);\n    const uint8x8_t  half  = vdup_n_u8(0x0f);\n\n    const uint8_t* ptr = reinterpret_cast<const uint8_t*>(s);\n\n    union {\n        uint8_t  tmp[8];\n        uint32_t word[2];\n    };\n\n    for (size_t i = 0; i < n; i += 16) {\n\n        const uint8x16_t block_first = vld1q_u8(ptr + i);\n        const uint8x16_t block_last  = vld1q_u8(ptr + i + k - 1);\n\n        const uint8x16_t eq_first = vceqq_u8(first, block_first);\n        const uint8x16_t eq_last  = vceqq_u8(last, block_last);\n        const uint8x16_t pred_16  = vandq_u8(eq_first, eq_last);\n        const uint8x8_t pred_8    = vbsl_u8(half, vget_low_u8(pred_16), vget_high_u8(pred_16));\n\n        vst1_u8(tmp, pred_8);\n\n        if ((word[0] | word[1]) == 0) {\n            continue;\n        }\n\n#if 0\n        for (int j=0; j < 8; j++) {\n            if ((tmp[j] & 0x0f) && (memcmp(s + i + j + 1, needle + 1, k - 2) == 0)) {\n                return i + j;\n            }\n        }\n\n        for (int j=0; j < 8; j++) {\n            if ((tmp[j] & 0xf0) && (memcmp(s + i + j + 1 + 8, needle + 1, k - 2) == 0)) {\n                return i + j + 8;\n            }\n        }\n#else\n        // the above loops unrolled\n        uint32_t v;\n\n#define RETURN_IF_EQ(MASK, SHIFT) \\\n        if ((v & MASK) && memcmp(s + i + SHIFT + 1, needle + 1, k - 2) == 0) { \\\n            return i + SHIFT; \\\n        }\n\n#define COMPARE(MASK, WORD_IDX, SHIFT) \\\n        v = word[WORD_IDX];      \\\n        RETURN_IF_EQ(MASK, SHIFT + 0); \\\n        v >>= 8; \\\n        RETURN_IF_EQ(MASK, SHIFT + 1); \\\n        v >>= 8; \\\n        RETURN_IF_EQ(MASK, SHIFT + 2); \\\n        v >>= 8; \\\n        RETURN_IF_EQ(MASK, SHIFT + 3);\n\n        COMPARE(0x0f, 0,  0);\n    
    COMPARE(0x0f, 1,  4);\n        COMPARE(0xf0, 0,  8);\n        COMPARE(0xf0, 1, 12);\n\n#undef RETURN_IF_EQ\n#undef COMPARE\n\n#endif\n    }\n\n    return std::string::npos;\n}\n\n// ------------------------------------------------------------------------\n\ntemplate <size_t k, typename MEMCMP>\nsize_t FORCE_INLINE neon_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) {\n\n    assert(k > 0);\n    assert(n > 0);\n\n    const uint8x16_t first = vdupq_n_u8(needle[0]);\n    const uint8x16_t last  = vdupq_n_u8(needle[k - 1]);\n    const uint8x8_t  half  = vdup_n_u8(0x0f);\n\n    const uint8_t* ptr = reinterpret_cast<const uint8_t*>(s);\n\n    union {\n        uint8_t  tmp[8];\n        uint32_t word[2];\n    };\n\n    for (size_t i = 0; i < n; i += 16) {\n\n        const uint8x16_t block_first = vld1q_u8(ptr + i);\n        const uint8x16_t block_last  = vld1q_u8(ptr + i + k - 1);\n\n        const uint8x16_t eq_first = vceqq_u8(first, block_first);\n        const uint8x16_t eq_last  = vceqq_u8(last, block_last);\n        const uint8x16_t pred_16  = vandq_u8(eq_first, eq_last);\n        const uint8x8_t pred_8    = vbsl_u8(half, vget_low_u8(pred_16), vget_high_u8(pred_16));\n\n        vst1_u8(tmp, pred_8);\n\n        if ((word[0] | word[1]) == 0) {\n            continue;\n        }\n\n#if 0\n        for (int j=0; j < 8; j++) {\n            if ((tmp[j] & 0x0f) && memcmp_fun(s + i + j + 1, needle + 1)) {\n                return i + j;\n            }\n        }\n\n        for (int j=0; j < 8; j++) {\n            if ((tmp[j] & 0xf0) && memcmp_fun(s + i + j + 1 + 8, needle + 1)) {\n                return i + j + 8;\n            }\n        }\n#else\n        // the above loops unrolled\n        uint32_t v;\n\n#define RETURN_IF_EQ(MASK, SHIFT) \\\n        if ((v & MASK) && memcmp_fun(s + i + SHIFT + 1, needle + 1)) { \\\n            return i + SHIFT; \\\n        }\n\n#define COMPARE(MASK, WORD_IDX, SHIFT) \\\n        v = word[WORD_IDX];      \\\n     
   RETURN_IF_EQ(MASK, SHIFT + 0); \\\n        v >>= 8; \\\n        RETURN_IF_EQ(MASK, SHIFT + 1); \\\n        v >>= 8; \\\n        RETURN_IF_EQ(MASK, SHIFT + 2); \\\n        v >>= 8; \\\n        RETURN_IF_EQ(MASK, SHIFT + 3);\n\n        COMPARE(0x0f, 0,  0);\n        COMPARE(0x0f, 1,  4);\n        COMPARE(0xf0, 0,  8);\n        COMPARE(0xf0, 1, 12);\n\n#undef RETURN_IF_EQ\n#undef COMPARE\n\n#endif\n    }\n\n    return std::string::npos;\n}\n\n// ------------------------------------------------------------------------\n\nsize_t neon_strstr_v2(const char* s, size_t n, const char* needle, size_t k) {\n\n    size_t result = std::string::npos;\n\n    if (n < k) {\n        return result;\n    }\n\n\tswitch (k) {\n\t\tcase 0:\n\t\t\treturn 0;\n\n\t\tcase 1: {\n            const char* res = reinterpret_cast<const char*>(strchr(s, needle[0]));\n\n\t\t\treturn (res != nullptr) ? res - s : std::string::npos;\n            }\n\n        case 2:\n            result = neon_strstr_memcmp<2>(s, n, needle, always_true);\n            break;\n\n        case 3:\n            result = neon_strstr_memcmp<3>(s, n, needle, memcmp1);\n            break;\n\n        case 4:\n            result = neon_strstr_memcmp<4>(s, n, needle, memcmp2);\n            break;\n\n        case 5:\n            result = neon_strstr_memcmp<5>(s, n, needle, memcmp4);\n            break;\n\n        case 6:\n            result = neon_strstr_memcmp<6>(s, n, needle, memcmp4);\n            break;\n\n        case 7:\n            result = neon_strstr_memcmp<7>(s, n, needle, memcmp5);\n            break;\n\n        case 8:\n            result = neon_strstr_memcmp<8>(s, n, needle, memcmp6);\n            break;\n\n        case 9:\n            result = neon_strstr_memcmp<9>(s, n, needle, memcmp8);\n            break;\n\n        case 10:\n            result = neon_strstr_memcmp<10>(s, n, needle, memcmp8);\n            break;\n\n        case 11:\n            result = neon_strstr_memcmp<11>(s, n, needle, memcmp9);\n            
break;\n\n        case 12:\n            result = neon_strstr_memcmp<12>(s, n, needle, memcmp10);\n            break;\n\n\t\tdefault:\n\t\t\tresult = neon_strstr_anysize(s, n, needle, k);\n            break;\n    }\n\n    if (result <= n - k) {\n        return result;\n    } else {\n        return std::string::npos;\n    }\n}\n\n// ------------------------------------------------------------------------\n\nsize_t neon_strstr_v2(const std::string& s, const std::string& needle) {\n\n    return neon_strstr_v2(s.data(), s.size(), needle.data(), needle.size());\n}\n\n\n"
  },
  {
    "path": "original/sse4_strstr-test.py",
    "content": "import sys, os, random\n\nfilename = \"<unspecified>\"\ntry:\n\tfilename = sys.argv[1]\n\tstring = open(filename, \"r\").read()\nexcept:\n\tprint \"can't open '%s'\" % filename\n\tsys.exit(1)\n\ntry:\n\trandom.seed(int(sys.argv[3]))\nexcept:\n\tpass\n\ndef time_command(command):\n\tos.system('/usr/bin/time -o /tmp/measure -f \"%U\" ' + command)\n\tf = open(\"/tmp/measure\", \"r\")\n\tt = float(f.read())\n\tf.close()\n\treturn t\n\n\ndef time(command1, command2, iters=10):\n\twhile True:\n\t\tt1 = time_command(command1.replace(\"__iters__\", str(iters)))\n\t\tif t1 > 1:\n\t\t\tt2 = time_command(command2.replace(\"__iters__\", str(iters)))\n\t\t\treturn iters, t1, t2\n\t\telse:\n\t\t\titers *= 10\n\n\ndef compare(filename, wordpos, word, wordlen):\n\tword = word.replace(\"%\", \"%%\")\n\tcmd1 = './a.out \"%s\" libc __iters__ \"%s\" > /dev/null' % (filename, word)\n\tcmd2 = './a.out \"%s\" sse4 __iters__ \"%s\" > /dev/null' % (filename, word)\n\t_, t1, t2 = time(cmd1, cmd2)\n\n\treturn \"[%d,%d] libc=%0.3fs sse4=%0.3fs speedup=%0.2f\" % (wordpos, wordlen, t1, t2, t1/t2)\n\n\nlogname   = \"sse4.log\"\nlognumber = 1\nwhile True:\n\tif not os.path.exists(logname):\n\t\tlog = open(logname, \"w\")\n\t\tbreak\n\telse:\n\t\tlogname = \"sse4%d.log\" % lognumber\n\t\tlognumber += 1\n\n\ntry:\n\tfor n in xrange(4, 64):\n\t\ti1 = random.randint(   0, 64)\n\t\ti2 = random.randint(  65, 1024)\n\t\ti3 = random.randint(1024, len(string)-n)\n\t\tprint \"length\", n\n\t\tfor i in [i1, i2, i3]:\n\t\t\tword = string[i:i+n]\n\t\t\tfor c in \"\\\\`()<>{}\\\"\":\n\t\t\t\tword = word.replace(c, \"\\\\\" + c)\n\t\t\t\n\t\t\tcmd = './a.out \"%s\" verify 1 \"%s\"' % (filename, word)\n\t\t\terr = os.system(cmd)\n\t\t\tif err:\n\t\t\t\tprint repr(string[i:i+l])\n\t\t\t\tsys.exit(1)\n\t\t\telse:\n\t\t\t\ts = compare(filename, i, word, n)\n\t\t\t\tlog.write(s + \"\\n\")\n\t\t\t\tprint s\nexcept:\n\timport traceback\n\ttraceback.print_exc()\n\tlog.close()\n"
  },
  {
    "path": "original/sse4_strstr.c",
    "content": "/*\n\tSSE4 string search --- modification of Karp-Rabin algorithm, $Revision: 1.11 $\n\t\n\tAcceleration of strstr using SSE4 instruction MPSADBW.\n\tThis program includes one wrapper sse4_strstr around\n\tfollowing functions:\n\n\t* sse4_strstr_any - exact comparison is done with built-in\n\t  function strncmp.c\n\t* sse4_strstr_len3, sse4_strstr_len4 - optimized\n\t  for substring of length 3 and 4 chars, no additional comparison\n\t  is needed\n\t* sse4_strstr_max20, sse4_strstr_max36 - optimized\n\t  for substring of length 4..20 and 20..36, exact comparison\n\t  is done with few assembler instructions\n\n\n\tAuthor: Wojciech Muła\n\te-mail: wojciech_mula@poczta.onet.pl\n\twww:    http://0x80.pl/\n\t\n\tLicense: BSD\n\t\n\tinitial release 27-05-2008, last update $Date: 2008-06-08 23:00:44 $\n*/\n\n#include <stdint.h>\n#include <stdlib.h>\n#include <stdio.h>\n#include <string.h>\n#include <strings.h>\n\nstatic uint8_t mask[][16] = {\n\t{0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},\n\t{0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},\n\t{0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},\n\t{0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},\n\t{0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},\n\t{0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},\n\t{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},\n\t{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00},\n\t{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00},\n\t{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00},\n\t{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00},\n\t{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00},\n\t
{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00},\n\t{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00},\n\t{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00},\n\t{0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff},\n};\n\nchar* sse4_strstr_any(char* s1, int n1, char* s2, int n2);\nchar* sse4_strstr_len3(char* s1, int n1, char* s2, int n2);\nchar* sse4_strstr_len4(char* s1, int n1, char* s2, int n2);\nchar* sse4_strstr_max20(char* s1, int n1, char* s2, int n2);\nchar* sse4_strstr_max36(char* s1, int n1, char* s2, int n2);\n\n\nchar* sse4_strstr(char* s1, int n1, char* s2, int n2) {\n\tswitch (n1) {\n\t\tcase 0:\n\t\t\treturn NULL;\n\t\tcase 1:\n\t\t\treturn strchr(s2, s1[1]);\n\t\tcase 2:\n\t\t\treturn strstr(s2, s1);\n\t\tcase 3:\n\t\t\treturn sse4_strstr_len3(s1, n1, s2, n2);\n\t\tcase 4:\n\t\t\treturn sse4_strstr_len4(s1, n1, s2, n2);\n\t\tcase 5: case 6: case 7: case 8: case 9:\n\t\tcase 10: case 11: case 12: case 13: case 14:\n\t\tcase 15: case 16: case 17: case 18: case 19:\n\t\tcase 20: /* 5..20 */\n\t\t\treturn sse4_strstr_max20(s1, n1, s2, n2);\n\t\tcase 21: case 22: case 23: case 24: case 25: \n\t\tcase 26: case 27: case 28: case 29: case 30: \n\t\tcase 31: case 32: case 33: case 34: case 35: \n\t\tcase 36: /* 21..36 */\n\t\t\treturn sse4_strstr_max36(s1, n1, s2, n2);\n\t\tdefault:\n\t\t\treturn sse4_strstr_any(s1, n1, s2, n2);\n\n\t}\n}\n\n\nchar* sse4_strstr_any(char* s1, int n1, char* s2, int n2) {\n\t// n1 > 4, n2 > 4\n\tchar* result;\n\tuint32_t dummy __attribute__((unused));\n\t\n\t__asm__ volatile (\"movdqu (%%eax), %%xmm1\" : : \"a\" (s1));\n\t__asm__ volatile (\"pxor    %%xmm0, %%xmm0\" : : );\n\t__asm__ volatile (\n\t\t/*** initialization ****************************************************/\n\t\t// we have to save 3 registers: eax, ecx and edx\n\t\t// also strncmp needs three arguments, thus esp -= (3+3)*4 = 
\n\t\t\"\taddl   $-24, %%esp\t\t\\n\"\n\n\t\t// function strncmp is invoke with argument s1+4, s2+4, n1-4 -- s1+4 and\n\t\t// n1-4 are constant across all iterations, thus stack frame\n\t\t// can be partially initialize:\n\t\t\"\tmovl   8(%%ebp), %%eax\t\t\\n\"\n\t\t\"\taddl         $4, %%eax\t\t\\n\"\n\t\t\"\tmovl      %%eax, 0(%%esp)\t\\n\" // s1+4\n\t\t\"\t\t\t\t\t\\n\"\n\t\t\"\tmovl  12(%%ebp), %%eax\t\t\\n\"\n\t\t\"\tsubl         $4, %%eax\t\t\\n\"\n\t\t\"\tmovl      %%eax, 8(%%esp)\t\\n\" // n1-4\n\t\t\"\t\t\t\t\t\\n\"\n\t\t\n\t\t/*** main loop *********************************************************/\n\t\t\"0:\t\t\t\t\t\\n\"\n\t\t\t// load 16 bytes, we consider just 8+3 chars at the beggining\n\t\t\"\tmovdqu (%%esi), %%xmm2\t\t\\n\"\n\t\t\"\taddl $8, %%esi\t\t\t\\n\" // advance pointer: s1 += 8\n\t\t\t\t\t\t\t\n\t\t\t// xmm2 - vector of L1 distances between s1's 4-byte prefix\n\t\t\t// and sequence of eight 4-byte subvectors from xmm2\n\t\t\"\tmpsadbw $0, %%xmm1, %%xmm2\t\\n\"\n\n\t\t\t// xmm2 - word become 0xffff if L1=0, 0x0000 otherwise\n\t\t\"\tpcmpeqw %%xmm0, %%xmm2\t\t\\n\"\n\n\t\t\t// any L1=0?  if no, skip comparision inner loop\n\t\t\"\tptest   %%xmm2, %%xmm0\t\t\\n\"\n\t\t\"\tjc      1f\t\t\t\\n\"\n\n\t\t\t/*** inner loop ************************************************/\n\t\t\t// comparision inner loop: convert word mask to bitmask\n\t\t\t\"\tpmovmskb %%xmm2, %%edx\t\t\\n\"\n\t\t\t\t// we are interested in **word** indexes\n\t\t\t\"\tandl $0b0101010101010101, %%edx\t\\n\"\n\n\t\t\"\t2:\t\t\t\t\t\\n\"\n\t\t\t\"\tbsf %%edx, %%eax\t\t\\n\"\t// get next bit position\n\t\t\t\"\tjz  1f\t\t\t\t\\n\"\t// no bit set? 
exit loop\n\t\t\t\"\t\t\t\t\t\\n\"\n\t\t\t\"\tbtr %%eax, %%edx\t\t\\n\"\t// unset bit\n\t\t\t\"\tshr $1, %%eax\t\t\t\\n\"\t// divide position by 2\n\t\t\t\t\n\t\t\t\t// save registers before invoke strncmp\n\t\t\t\"\tmovl  %%eax, 12(%%esp)\t\t\\n\"\n\t\t\t\"\tmovl  %%ecx, 16(%%esp)\t\t\\n\"\n\t\t\t\"\tmovl  %%edx, 20(%%esp)\t\t\\n\"\n\n\t\t\t\t// update function argument\n\t\t\t\"\tleal -4(%%esi, %%eax), %%eax\t\\n\"\t\n\t\t\t\"\tmovl  %%eax, 4(%%esp)\t\t\\n\"\t// s2+4\n\n\t\t\t\t// invoke strncmp(s1+4, s2+4, n1-4)\n\t\t\t\"\tcall  strncmp\t\t\t\\n\"\n\t\t\t\"\ttest  %%eax, %%eax\t\t\\n\"\t// result == 0?\n\t\t\t\t\n\t\t\t\t// restore registers\n\t\t\t\"\tmovl  12(%%esp), %%eax\t\t\\n\"\n\t\t\t\"\tmovl  16(%%esp), %%ecx\t\t\\n\"\n\t\t\t\"\tmovl  20(%%esp), %%edx\t\t\\n\"\n\t\t\t\"\tjnz 2b\t\t\t\t\\n\"\n\n\t\t\t\"\tleal -8(%%eax, %%esi), %%eax\t\\n\"\t// eax -- address\n\t\t\t\"\tjmp 4f\t\t\t\t\\n\"\t// of s1's first occurance\n\n\t\t/*** main loop prologue ************************************************/\n\t\t\"1:\t\t\t\t\t\\n\"\n\t\t\"\tsubl $8, %%ecx\t\t\t\\n\"\n\t\t\"\tcmpl $0, %%ecx\t\t\t\\n\"\n\t\t\"\tjg   0b\t\t\t\t\\n\"\n\n\t\t\"\txorl %%eax, %%eax\t\t\\n\" // s1 not found, return NULL\n\t\t\"4:\t\t\t\t\t\\n\"\n\t\t\"\taddl   $24, %%esp\t\t\\n\" // and finally restore stack frame\n\t\t: \"=a\" (result),\n\t\t  \"=S\" (dummy),\n\t\t  \"=c\" (dummy)\n\t\t: \"S\" (s2),\n\t\t  \"c\" (n2-n1)\n\t);\n\n\treturn result;\n}\n\n\nchar* sse4_strstr_max20(char* s1, int n1, char* s2, int n2) {\n\t// 4 <= n1 <= 20, n2 > 4\n\tuint32_t dummy __attribute__((unused));\n\tchar* result;\n\t\n\t__asm__ volatile (\"movdqu (%%eax), %%xmm6\" : : \"a\" (mask[n1-5]));\n\t__asm__ volatile (\"movdqu (%%eax), %%xmm1\" : : \"a\" (s1));\n\t__asm__ volatile (\"movdqu (%%eax), %%xmm2\" : : \"a\" (s1+4));\t// xmm2 -- s1 suffix\n\t__asm__ volatile (\"pxor    %%xmm0, %%xmm0\" : : );\n\t__asm__ volatile (\n\t\t/*** main loop 
*********************************************************/\n\t\t\"0:\t\t\t\t\t\\n\"\n\t\t\t// load 16 bytes, MPSADBW consider just 8+3 chars at the beggining\n\t\t\"\tmovdqu (%%esi), %%xmm7\t\t\\n\"\n\t\t\"\taddl $8, %%esi\t\t\t\\n\" // advance pointer: s1 += 8\n\t\t\t\t\t\t\t\n\t\t\t// xmm2 - vector of L1 distances between s1's 4-byte prefix\n\t\t\t// and sequence of eight 4-byte subvectors from xmm2\n\t\t\"\tmpsadbw $0, %%xmm1, %%xmm7\t\\n\"\n\n\t\t\t// xmm2 - word become 0xffff if L1=0, 0x0000 otherwise\n\t\t\"\tpcmpeqw %%xmm0, %%xmm7\t\t\\n\"\n\n\t\t\t// any L1=0?  if no, skip comparision inner loop\n\t\t\"\tptest   %%xmm7, %%xmm0\t\t\\n\"\n\t\t\"\tjc      1f\t\t\t\\n\"\n\n\t\t\t/*** inner loop ************************************************/\n\t\t\t// comparision inner loop: convert word mask to bitmask\n\t\t\t\"\tpmovmskb %%xmm7, %%edx\t\t\\n\"\n\t\t\t\t// we are interested in **word** positions\n\t\t\t\"\tandl $0b0101010101010101, %%edx\t\\n\"\n\n\t\t\"\t2:\t\t\t\t\t\\n\"\n\t\t\t\"\tbsf %%edx, %%eax\t\t\\n\"\t// get next bit position\n\t\t\t\"\tjz  1f\t\t\t\t\\n\"\t// no bit set? 
exit loop\n\t\t\t\"\t\t\t\t\t\\n\"\n\t\t\t\"\tbtr %%eax, %%edx\t\t\\n\"\t// unset bit\n\t\t\t\"\tshr $1, %%eax\t\t\t\\n\"\t// divide position by 2\n\t\t\t\"\tmovdqu -4(%%esi, %%eax), %%xmm7\t\\n\"\n\t\t\t\"\tpcmpeqb %%xmm2, %%xmm7\t\t\\n\"\n\t\t\t\"\tptest\t%%xmm6, %%xmm7\t\t\\n\"\n\t\t\t\"\tjnc 2b\t\t\t\t\\n\"\n\n\t\t\t\"\tleal -8(%%eax, %%esi), %%eax\t\\n\"\t// eax -- address\n\t\t\t\"\tjmp 4f\t\t\t\t\\n\"\t// of s1's first occurance\n\n\t\t/*** main loop prologue ************************************************/\n\t\t\"1:\t\t\t\t\t\\n\"\n\t\t\"\tsubl $8, %%ecx\t\t\t\\n\"\n\t\t\"\tcmpl $0, %%ecx\t\t\t\\n\"\n\t\t\"\tjg   0b\t\t\t\t\\n\"\n\n\t\t\"\txorl %%eax, %%eax\t\t\\n\" // s1 not found, return NULL\n\t\t\"4:\t\t\t\t\t\\n\"\n\t\t: \"=a\" (result),\n\t\t  \"=S\" (dummy),\n\t\t  \"=c\" (dummy)\n\t\t: \"S\" (s2),\n\t\t  \"c\" (n2-n1)\n\t);\n\n\treturn result;\n}\n\n\n\nchar* sse4_strstr_max36(char* s1, int n1, char* s2, int n2) {\n\t// 20 <= n1 <= 36, n2 > 4\n\tuint32_t dummy __attribute__((unused));\n\tchar* result;\n\t\n\t__asm__ volatile (\"movdqu (%%eax), %%xmm1\" : : \"a\" (s1));\n\t__asm__ volatile (\"movdqu (%%eax), %%xmm2\" : : \"a\" (s1+4));\t\t// xmm2 - s1[4:20]\n\t__asm__ volatile (\"movdqu (%%eax), %%xmm3\" : : \"a\" (s1+4+16));\t// xmm3 - s1[20:] (suffix)\n\t__asm__ volatile (\"movdqu (%%eax), %%xmm6\" : : \"a\" (mask[n1-5-16]));\n\t__asm__ volatile (\"pand    %%xmm6, %%xmm3\" : : );\n\t__asm__ volatile (\"pxor    %%xmm0, %%xmm0\" : : ); // packed_byte(0x00)\n\t__asm__ volatile (\"pcmpeqb %%xmm5, %%xmm5\" : : ); // packed_byte(0xff)\n\t__asm__ volatile (\n\t\t/*** main loop *********************************************************/\n\t\t\"0:\t\t\t\t\t\\n\"\n\t\t\t// load 16 bytes, MPSADBW consider just 8+3 chars at the beggining\n\t\t\"\tmovdqu (%%esi), %%xmm7\t\t\\n\"\n\t\t\"\taddl $8, %%esi\t\t\t\\n\" // advance pointer: s1 += 8\n\t\t\t\t\t\t\t\n\t\t\t// xmm2 - vector of L1 distances between s1's 4-byte prefix\n\t\t\t// and sequence of eight 4-byte 
subvectors from xmm2\n\t\t\"\tmpsadbw $0, %%xmm1, %%xmm7\t\\n\"\n\n\t\t\t// xmm2 - word become 0xffff if L1=0, 0x0000 otherwise\n\t\t\"\tpcmpeqw %%xmm0, %%xmm7\t\t\\n\"\n\n\t\t\t// any L1=0?  if no, skip comparision inner loop\n\t\t\"\tptest   %%xmm7, %%xmm0\t\t\\n\"\n\t\t\"\tjc      1f\t\t\t\\n\"\n\n\t\t\t/*** inner loop ************************************************/\n\t\t\t// comparision inner loop: convert word mask to bitmask\n\t\t\t\"\tpmovmskb %%xmm7, %%edx\t\t\\n\"\n\t\t\t\t// we are interested in **word** positions\n\t\t\t\"\tandl $0b0101010101010101, %%edx\t\\n\"\n\n\t\t\"\t2:\t\t\t\t\t\\n\"\n\t\t\t\"\tbsf %%edx, %%eax\t\t\\n\"\t// get next bit position\n\t\t\t\"\tjz  1f\t\t\t\t\\n\"\t// no bit set? exit loop\n\t\t\t\"\t\t\t\t\t\\n\"\n\t\t\t\"\tbtr %%eax, %%edx\t\t\\n\"\t// unset bit\n\t\t\t\"\tshr $1, %%eax\t\t\t\\n\"\t// divide position by 2\n\t\t\t\"\tmovdqu -4(%%esi, %%eax), %%xmm7\t\\n\"\n\t\t\t\"\tmovdqu 12(%%esi, %%eax), %%xmm4\t\\n\"\n\t\t\t\"\tpand    %%xmm6, %%xmm4\t\t\\n\"\n\t\t\t\"\tpcmpeqb %%xmm2, %%xmm7\t\t\\n\"\n\t\t\t\"\tpcmpeqb %%xmm3, %%xmm4\t\t\\n\"\n\t\t\t\"\tpand    %%xmm7, %%xmm4\t\t\\n\"\n\t\t\t\"\tptest\t%%xmm5, %%xmm4\t\t\\n\"\n\t\t\t\"\tjnc 2b\t\t\t\t\\n\"\n\n\t\t\t\"\tleal -8(%%eax, %%esi), %%eax\t\\n\"\t// eax -- address\n\t\t\t\"\tjmp 4f\t\t\t\t\\n\"\t// of s1's first occurance\n\n\t\t/*** main loop prologue ************************************************/\n\t\t\"1:\t\t\t\t\t\\n\"\n\t\t\"\tsubl $8, %%ecx\t\t\t\\n\"\n\t\t\"\tcmpl $0, %%ecx\t\t\t\\n\"\n\t\t\"\tjg   0b\t\t\t\t\\n\"\n\n\t\t\"\txorl %%eax, %%eax\t\t\\n\" // s1 not found, return NULL\n\t\t\"4:\t\t\t\t\t\\n\"\n\t\t: \"=a\" (result),\n\t\t  \"=S\" (dummy),\n\t\t  \"=c\" (dummy)\n\t\t: \"S\" (s2),\n\t\t  \"c\" (n2-n1)\n\t);\n\n\treturn result;\n}\n\n\nchar* sse4_strstr_len4(char* s1, int n1, char* s2, int n2) {\n\t// n1 == 4, n2 > 4\n\tuint32_t dummy __attribute__((unused));\n\tchar* result;\n\t\n\t__asm__ volatile (\"movdqu (%%eax), %%xmm1\" : : \"a\" 
(s1));\n\t__asm__ volatile (\"pxor    %%xmm0, %%xmm0\" : : );\n\t__asm__ volatile (\n\t\t/*** main loop *********************************************************/\n\t\t\"0:\t\t\t\t\t\\n\"\n\t\t\t// load 16 bytes, we consider just 8+3 chars at the beggining\n\t\t\"\tmovdqu (%%esi), %%xmm2\t\t\\n\"\n\t\t\"\taddl $8, %%esi\t\t\t\\n\" // advance pointer: s1 += 8\n\t\t\t\t\t\t\t\n\t\t\t// xmm2 - vector of L1 distances between s1's 4-byte prefix\n\t\t\t// and sequence of eight 4-byte subvectors from xmm2\n\t\t\"\tmpsadbw $0, %%xmm1, %%xmm2\t\\n\"\n\n\t\t\t// xmm2 - word become 0xffff if L1=0, 0x0000 otherwise\n\t\t\"\tpcmpeqw %%xmm0, %%xmm2\t\t\\n\"\n\n\t\t\t// any L1=0?  if no, skip comparision inner loop\n\t\t\"\tptest   %%xmm2, %%xmm0\t\t\\n\"\n\t\t\"\tjnc     1f\t\t\t\\n\"\n\n\t\t\"\tsubl $8, %%ecx\t\t\t\\n\"\n\t\t\"\tcmpl $0, %%ecx\t\t\t\\n\"\n\t\t\"\tjg   0b\t\t\t\t\\n\"\n\n\t\t\"\txorl %%eax, %%eax\t\t\\n\" // s1 not found, return NULL\n\t\t\"\tjmp  2f\t\t\t\t\\n\"\n\n\t\t\"1:\t\t\t\t\t\\n\"\n\t\t\"\tpmovmskb %%xmm2, %%eax\t\t\\n\"\n\t\t\"\tbsfl      %%eax, %%eax\t\t\\n\"\n\t\t\"\tshrl         $1, %%eax\t\t\\n\"\n\t\t\"\tlea -8(%%esi, %%eax), %%eax\t\\n\"\n\t\t\"2:\t\t\t\t\t\\n\"\n\t\t: \"=a\" (result),\n\t\t  \"=S\" (dummy),\n\t\t  \"=c\" (dummy)\n\t\t: \"S\" (s2),\n\t\t  \"c\" (n2-n1)\n\t);\n\n\treturn result;\n}\n\n\nchar* sse4_strstr_len3(char* s1, int n1, char* s2, int n2) {\n\t// n1 == 4, n2 > 4\n\tuint32_t dummy __attribute__((unused));\n\tchar* result;\n\t\n\t__asm__ volatile (\"movdqu (%%eax), %%xmm1\" : : \"a\" (s1));\n\t__asm__ volatile (\"pxor    %%xmm0, %%xmm0\" : : );\n\t__asm__ volatile (\n\t\t/*** main loop *********************************************************/\n\t\t\"0:\t\t\t\t\t\\n\"\n\t\t\t// load 16 bytes, we consider just 8+3 chars at the beggining\n\t\t\"\tmovdqu (%%esi), %%xmm2\t\t\\n\"\n\t\t\"\taddl $8, %%esi\t\t\t\\n\" // advance pointer: s1 += 8\n\t\t\"\tmovdqa  %%xmm2, %%xmm3\t\t\\n\"\n\t\t\"\tpsrldq      $3, 
%%xmm3\t\t\\n\"\n\t\t\"\tpmovzxbw %%xmm3, %%xmm3\t\t\\n\"\n\t\t\t\t\t\t\t\n\t\t\t// xmm2 - vector of L1 distances between s1's 4-byte prefix\n\t\t\t// and sequence of eight 4-byte subvectors from xmm2\n\t\t\"\tmpsadbw $0, %%xmm1, %%xmm2\t\\n\"\n\t\t\"\tpsubw   %%xmm3, %%xmm2\t\t\\n\"\n\n\t\t\t// xmm2 - word become 0xffff if L1=0, 0x0000 otherwise\n\t\t\"\tpcmpeqw %%xmm0, %%xmm2\t\t\\n\"\n\n\t\t\t// any L1=0?  if no, skip comparision inner loop\n\t\t\"\tptest   %%xmm2, %%xmm0\t\t\\n\"\n\t\t\"\tjnc     1f\t\t\t\\n\"\n\n\t\t\"\tsubl $8, %%ecx\t\t\t\\n\"\n\t\t\"\tcmpl $0, %%ecx\t\t\t\\n\"\n\t\t\"\tjg   0b\t\t\t\t\\n\"\n\n\t\t\"\txorl %%eax, %%eax\t\t\\n\" // s1 not found, return NULL\n\t\t\"\tjmp  2f\t\t\t\t\\n\"\n\n\t\t\"1:\t\t\t\t\t\\n\"\n\t\t\"\tpmovmskb %%xmm2, %%eax\t\t\\n\"\n\t\t\"\tbsfl      %%eax, %%eax\t\t\\n\"\n\t\t\"\tshrl         $1, %%eax\t\t\\n\"\n\t\t\"\tlea -8(%%esi, %%eax), %%eax\t\\n\"\n\t\t\"2:\t\t\t\t\t\\n\"\n\t\t: \"=a\" (result),\n\t\t  \"=S\" (dummy),\n\t\t  \"=c\" (dummy)\n\t\t: \"S\" (s2),\n\t\t  \"c\" (n2-n1)\n\t);\n\n\treturn result;\n}\n\n\n// sample\nuint8_t buffer[1024*500 + 1];\n\n\nvoid help() {\n\tputs(\"prog file sse4|libc|verify iter-count string\");\n\tputs(\"* iter-count > 0\");\n\texit(1);\n}\n\n\nint main(int argc, char* argv[]) {\n\tFILE* f;\n\tint i;\n\tint size;\n\n\tif (argc != 5)\n\t\thelp();\n\n\tf = fopen(argv[1], \"r\");\n\tif (!f) {\n\t\tprintf(\"can't open '%s'\\n\", argv[1]);\n\t\treturn 2;\n\t}\n\t\t\n\tsize = fread(buffer, 1, sizeof(buffer), f);\n\tbuffer[size] = 0;\n\tfclose(f);\n\n\tint fun = -1, iters, n1;\n\tchar* s1;\n\tif (strcasecmp(\"sse4\", argv[2]) == 0)\n\t\tfun = 0;\n\telse\n\tif (strcasecmp(\"libc\", argv[2]) == 0)\n\t\tfun = 1;\n\telse\n\tif (strcasecmp(\"verify\", argv[2]) == 0)\n\t\tfun = 2;\n\telse\n\t\thelp();\n\n\tif (atoi(argv[3]) <= 0 && (fun != 2))\n\t\thelp();\n\telse\n\t\titers = atoi(argv[3]);\n\n\ts1 = argv[4];\n\tn1 = strlen(s1);\n\tif ((n1 < 
3))\n\t\thelp();\n\telse\n\t\tprintf(\"s1(%d)='%s' s2(%d)\\n\", n1, s1, size);\n\n\tchar* r1;\n\tchar* r2;\n\n\tswitch (fun) {\n\t\tcase 0:\n\t\t\tputs(\"SSE4\");\n\t\t\tfor (i=0; i < iters; i++)\n\t\t\t\tsse4_strstr(s1, n1, (char*)buffer, size);\n\t\t\tbreak;\n\n\t\tcase 1:\n\t\t\tputs(\"Lib C\");\n\t\t\tfor (i=0; i < iters; i++) {\n\t\t\t\t//(unsigned int)strstr((char*)buffer, s1);\n\t\t\t\t__asm__ volatile (\n\t\t\t\t\t\"movl $buffer,  (%%esp)\\n\"\n\t\t\t\t\t\"movl      %0, 4(%%esp)\\n\"\n\t\t\t\t\t\"call strstr\\n\"\n\t\t\t\t\t:\n\t\t\t\t\t: \"r\" (s1)\n\t\t\t\t\t: \"eax\", \"ecx\", \"edx\"\n\t\t\t\t);\n\t\t\t}\n\t\t\tbreak;\n\t\t\n\t\tcase 2:\n\t\t\tputs(\"verify\");\n\t\t\tr1 = strstr((char*)buffer, s1);\n\t\t\tr2 = sse4_strstr(s1, n1, (char*)buffer, size);\n\t\t\t\n\t\t\tprintf(\"LibC = %u\\n\", (unsigned int)r1);\n\t\t\tprintf(\"SSE4 = %u %s\\n\",\n\t\t\t\t(unsigned int)r2,\n\t\t\t\t(r1 != r2) ? \"FAILED!!!\" : \"ok\"\n\t\t\t);\n\t\t\t\t\n\t\t\tif (r1 != r2)\n\t\t\t\treturn 1;\n\t}\n\n\n\treturn 0;\n}\n\n// eof\n"
  },
  {
    "path": "results/armv7-32bit-gcc4.9.2.txt",
    "content": "./speedup_arm data/i386.txt data/words 1\nstd::strstr                             ... reference result = 810807651, time =   7.318775 s\nstd::string::find                       ... reference result = 810807651, time =   4.171311 s\nSWAR 32-bit (generic)                   ... reference result = 810807651, time =   2.450585 s\nARM Neon 32 bit (v2)                    ... reference result = 810807651, time =   1.299383 s\n./speedup_arm data/i386.txt data/words 1\nstd::strstr                             ... reference result = 810807651, time =   7.329223 s\nstd::string::find                       ... reference result = 810807651, time =   4.188313 s\nSWAR 32-bit (generic)                   ... reference result = 810807651, time =   2.461333 s\nARM Neon 32 bit (v2)                    ... reference result = 810807651, time =   1.305622 s\n./speedup_arm data/i386.txt data/words 1\nstd::strstr                             ... reference result = 810807651, time =   7.304049 s\nstd::string::find                       ... reference result = 810807651, time =   4.172608 s\nSWAR 32-bit (generic)                   ... reference result = 810807651, time =   2.451913 s\nARM Neon 32 bit (v2)                    ... reference result = 810807651, time =   1.300619 s\n./speedup_arm data/i386.txt data/words 1\nstd::strstr                             ... reference result = 810807651, time =   7.307621 s\nstd::string::find                       ... reference result = 810807651, time =   4.176439 s\nSWAR 32-bit (generic)                   ... reference result = 810807651, time =   2.451030 s\nARM Neon 32 bit (v2)                    ... reference result = 810807651, time =   1.299274 s\n./speedup_arm data/i386.txt data/words 1\nstd::strstr                             ... reference result = 810807651, time =   7.313498 s\nstd::string::find                       ... reference result = 810807651, time =   4.175714 s\nSWAR 32-bit (generic)                   ... 
reference result = 810807651, time =   2.451439 s\nARM Neon 32 bit (v2)                    ... reference result = 810807651, time =   1.298613 s\n"
  },
  {
    "path": "results/armv8-64bit-clang3.8.0.txt",
    "content": "std::strstr                             ... reference result = 810807651, time =   3.457578 s\nstd::string::find                       ... reference result = 810807651, time =   1.821379 s\nSWAR 64-bit (generic)                   ... reference result = 810807651, time =   0.463006 s\nSWAR 32-bit (generic)                   ... reference result = 810807651, time =   0.810749 s\nARM Neon 32 bit (v2)                    ... reference result = 810807651, time =   0.407214 s\nAArch64 64 bit (v2)                     ... reference result = 810807651, time =   0.279203 s\nstd::strstr                             ... reference result = 810807651, time =   3.381364 s\nstd::string::find                       ... reference result = 810807651, time =   1.813678 s\nSWAR 64-bit (generic)                   ... reference result = 810807651, time =   0.462694 s\nSWAR 32-bit (generic)                   ... reference result = 810807651, time =   0.810882 s\nARM Neon 32 bit (v2)                    ... reference result = 810807651, time =   0.406888 s\nAArch64 64 bit (v2)                     ... reference result = 810807651, time =   0.278970 s\nstd::strstr                             ... reference result = 810807651, time =   4.118293 s\nstd::string::find                       ... reference result = 810807651, time =   1.822696 s\nSWAR 64-bit (generic)                   ... reference result = 810807651, time =   0.463028 s\nSWAR 32-bit (generic)                   ... reference result = 810807651, time =   0.810933 s\nARM Neon 32 bit (v2)                    ... reference result = 810807651, time =   0.407296 s\nAArch64 64 bit (v2)                     ... reference result = 810807651, time =   0.279606 s\nstd::strstr                             ... reference result = 810807651, time =   3.375462 s\nstd::string::find                       ... reference result = 810807651, time =   1.821449 s\nSWAR 64-bit (generic)                   ... 
reference result = 810807651, time =   0.462863 s\nSWAR 32-bit (generic)                   ... reference result = 810807651, time =   0.811320 s\nARM Neon 32 bit (v2)                    ... reference result = 810807651, time =   0.407274 s\nAArch64 64 bit (v2)                     ... reference result = 810807651, time =   0.279285 s\nstd::strstr                             ... reference result = 810807651, time =   3.378566 s\nstd::string::find                       ... reference result = 810807651, time =   1.825054 s\nSWAR 64-bit (generic)                   ... reference result = 810807651, time =   0.462957 s\nSWAR 32-bit (generic)                   ... reference result = 810807651, time =   0.811188 s\nARM Neon 32 bit (v2)                    ... reference result = 810807651, time =   0.407364 s\nAArch64 64 bit (v2)                     ... reference result = 810807651, time =   0.279490 s\n"
  },
  {
    "path": "results/bulldozer-fx-8510-gcc4.8.4-sse.txt",
    "content": "./speedup data/i386.txt data/words \nstd::strstr                             ... reference result = 8108076510, time =   9.390892 s\nSWAR 64-bit (generic)                   ... reference result = 8108076510, time =   2.938355 s\nSSE2 (generic)                          ... reference result = 8108076510, time =   0.788781 s\nSSE4.1 (MPSADBW)                        ... reference result = 8108076510, time =   1.989833 s\nSSE4.1 (MPSADBW unrolled)               ... reference result = 8108076510, time =   2.060081 s\nSSE4.2 (PCMPESTRM)                      ... reference result = 8108076510, time =   2.006810 s\n./speedup data/i386.txt data/words \nstd::strstr                             ... reference result = 8108076510, time =   9.387153 s\nSWAR 64-bit (generic)                   ... reference result = 8108076510, time =   2.948608 s\nSSE2 (generic)                          ... reference result = 8108076510, time =   0.789325 s\nSSE4.1 (MPSADBW)                        ... reference result = 8108076510, time =   1.988635 s\nSSE4.1 (MPSADBW unrolled)               ... reference result = 8108076510, time =   2.066327 s\nSSE4.2 (PCMPESTRM)                      ... reference result = 8108076510, time =   2.007233 s\n./speedup data/i386.txt data/words \nstd::strstr                             ... reference result = 8108076510, time =   9.377923 s\nSWAR 64-bit (generic)                   ... reference result = 8108076510, time =   2.967027 s\nSSE2 (generic)                          ... reference result = 8108076510, time =   0.788709 s\nSSE4.1 (MPSADBW)                        ... reference result = 8108076510, time =   1.989077 s\nSSE4.1 (MPSADBW unrolled)               ... reference result = 8108076510, time =   2.065608 s\nSSE4.2 (PCMPESTRM)                      ... reference result = 8108076510, time =   2.007228 s\n\n"
  },
  {
    "path": "results/cascadelake-Gold-5217-gcc-7.4.0-avx512bw.txt",
    "content": "./speedup_avx512bw data/i386.txt data/words\nscalar (naive)                          ... reference result = 8108076510, time =   4.095307 s\nstd::strstr                             ... reference result = 8108076510, time =   0.492459 s\nSWAR 64-bit (generic)                   ... reference result = 8108076510, time =   1.243510 s\nSWAR 32-bit (generic)                   ... reference result = 8108076510, time =   2.349437 s\nSSE2 (generic)                          ... reference result = 8108076510, time =   0.443313 s\nSSE4.1 (MPSADBW)                        ... reference result = 8108076510, time =   0.583372 s\nSSE4.1 (MPSADBW unrolled)               ... reference result = 8108076510, time =   0.822263 s\nSSE4.2 (PCMPESTRM)                      ... reference result = 8108076510, time =   1.311350 s\nSSE (naive)                             ... reference result = 8108076510, time =   1.757493 s\nAVX2 (MPSADBW)                          ... reference result = 8108076510, time =   0.531920 s\nAVX2 (generic)                          ... reference result = 8108076510, time =   0.338738 s\nAVX2 (naive)                            ... reference result = 8108076510, time =   1.013489 s\nAVX2-wide (naive)                       ... reference result = 8107771150, time =   0.480182 s\nAVX512F (MPSADBW-like)                  ... reference result = 8108076510, time =   0.634909 s\nAVX512F (generic)                       ... reference result = 8108076510, time =   0.281276 s\nAVX512BW (generic)                      ... reference result = 8108076510, time =   0.256798 s\n./speedup_avx512bw data/i386.txt data/words\nscalar (naive)                          ... reference result = 8108076510, time =   4.089051 s\nstd::strstr                             ... reference result = 8108076510, time =   0.492275 s\nSWAR 64-bit (generic)                   ... reference result = 8108076510, time =   1.243637 s\nSWAR 32-bit (generic)                   ... 
reference result = 8108076510, time =   2.343078 s\nSSE2 (generic)                          ... reference result = 8108076510, time =   0.443659 s\nSSE4.1 (MPSADBW)                        ... reference result = 8108076510, time =   0.584467 s\nSSE4.1 (MPSADBW unrolled)               ... reference result = 8108076510, time =   0.822993 s\nSSE4.2 (PCMPESTRM)                      ... reference result = 8108076510, time =   1.313485 s\nSSE (naive)                             ... reference result = 8108076510, time =   1.760697 s\nAVX2 (MPSADBW)                          ... reference result = 8108076510, time =   0.531827 s\nAVX2 (generic)                          ... reference result = 8108076510, time =   0.338912 s\nAVX2 (naive)                            ... reference result = 8108076510, time =   1.012637 s\nAVX2-wide (naive)                       ... reference result = 8107771150, time =   0.478455 s\nAVX512F (MPSADBW-like)                  ... reference result = 8108076510, time =   0.636537 s\nAVX512F (generic)                       ... reference result = 8108076510, time =   0.279054 s\nAVX512BW (generic)                      ... reference result = 8108076510, time =   0.255777 s\n./speedup_avx512bw data/i386.txt data/words\nscalar (naive)                          ... reference result = 8108076510, time =   4.092489 s\nstd::strstr                             ... reference result = 8108076510, time =   0.489993 s\nSWAR 64-bit (generic)                   ... reference result = 8108076510, time =   1.241418 s\nSWAR 32-bit (generic)                   ... reference result = 8108076510, time =   2.346954 s\nSSE2 (generic)                          ... reference result = 8108076510, time =   0.442109 s\nSSE4.1 (MPSADBW)                        ... reference result = 8108076510, time =   0.583955 s\nSSE4.1 (MPSADBW unrolled)               ... reference result = 8108076510, time =   0.822657 s\nSSE4.2 (PCMPESTRM)                      ... 
reference result = 8108076510, time =   1.312243 s\nSSE (naive)                             ... reference result = 8108076510, time =   1.757719 s\nAVX2 (MPSADBW)                          ... reference result = 8108076510, time =   0.532528 s\nAVX2 (generic)                          ... reference result = 8108076510, time =   0.338666 s\nAVX2 (naive)                            ... reference result = 8108076510, time =   1.013151 s\nAVX2-wide (naive)                       ... reference result = 8107771150, time =   0.477202 s\nAVX512F (MPSADBW-like)                  ... reference result = 8108076510, time =   0.634753 s\nAVX512F (generic)                       ... reference result = 8108076510, time =   0.280525 s\nAVX512BW (generic)                      ... reference result = 8108076510, time =   0.256838 s\n"
  },
  {
    "path": "results/haswell-i7-4770-gcc5.4.1-avx2.txt",
    "content": "./speedup_avx2 data/i386.txt data/words \nstd::strstr                             ... reference result = 8108076510, time =   0.528137 s\nSWAR 64-bit (generic)                   ... reference result = 8108076510, time =   1.605520 s\nSSE2 (generic)                          ... reference result = 8108076510, time =   0.554532 s\nSSE4.1 (MPSADBW)                        ... reference result = 8108076510, time =   0.897859 s\nSSE4.1 (MPSADBW unrolled)               ... reference result = 8108076510, time =   0.996473 s\nSSE4.2 (PCMPESTRM)                      ... reference result = 8108076510, time =   1.559956 s\nAVX2 (MPSADBW)                          ... reference result = 8108076510, time =   0.615836 s\nAVX2 (generic)                          ... reference result = 8108076510, time =   0.386747 s\n./speedup_avx2 data/i386.txt data/words \nstd::strstr                             ... reference result = 8108076510, time =   0.527864 s\nSWAR 64-bit (generic)                   ... reference result = 8108076510, time =   1.577149 s\nSSE2 (generic)                          ... reference result = 8108076510, time =   0.554352 s\nSSE4.1 (MPSADBW)                        ... reference result = 8108076510, time =   0.897752 s\nSSE4.1 (MPSADBW unrolled)               ... reference result = 8108076510, time =   0.996771 s\nSSE4.2 (PCMPESTRM)                      ... reference result = 8108076510, time =   1.560012 s\nAVX2 (MPSADBW)                          ... reference result = 8108076510, time =   0.615825 s\nAVX2 (generic)                          ... reference result = 8108076510, time =   0.386528 s\n./speedup_avx2 data/i386.txt data/words \nstd::strstr                             ... reference result = 8108076510, time =   0.528205 s\nSWAR 64-bit (generic)                   ... reference result = 8108076510, time =   1.591732 s\nSSE2 (generic)                          ... 
reference result = 8108076510, time =   0.554423 s\nSSE4.1 (MPSADBW)                        ... reference result = 8108076510, time =   0.897921 s\nSSE4.1 (MPSADBW unrolled)               ... reference result = 8108076510, time =   0.996889 s\nSSE4.2 (PCMPESTRM)                      ... reference result = 8108076510, time =   1.559919 s\nAVX2 (MPSADBW)                          ... reference result = 8108076510, time =   0.615783 s\nAVX2 (generic)                          ... reference result = 8108076510, time =   0.386609 s\n"
  },
  {
    "path": "results/knights-landing-7210-gcc5.3.0-avx512f.txt",
    "content": "./speedup_avx512 data/i386.txt data/words \nstd::strstr                             ... reference result = 8108076510, time =   4.964439 s\nSWAR 64-bit (generic)                   ... reference result = 8108076510, time =   8.205818 s\nSSE2 (generic)                          ... reference result = 8108076510, time =   6.126381 s\nSSE4.1 (MPSADBW)                        ... reference result = 8108076510, time =  18.737857 s\nSSE4.1 (MPSADBW unrolled)               ... reference result = 8108076510, time =  13.745691 s\nSSE4.2 (PCMPESTRM)                      ... reference result = 8108076510, time =   6.306659 s\nAVX2 (MPSADBW)                          ... reference result = 8108076510, time =  13.179747 s\nAVX2 (generic)                          ... reference result = 8108076510, time =   4.113571 s\nAVX512F (MPSADBW-like)                  ... reference result = 8108076510, time =   2.348848 s\nAVX512F (generic)                       ... reference result = 8108076510, time =   1.164081 s\n./speedup_avx512 data/i386.txt data/words \nstd::strstr                             ... reference result = 8108076510, time =   4.946063 s\nSWAR 64-bit (generic)                   ... reference result = 8108076510, time =   8.172884 s\nSSE2 (generic)                          ... reference result = 8108076510, time =   6.107860 s\nSSE4.1 (MPSADBW)                        ... reference result = 8108076510, time =  18.717146 s\nSSE4.1 (MPSADBW unrolled)               ... reference result = 8108076510, time =  13.724856 s\nSSE4.2 (PCMPESTRM)                      ... reference result = 8108076510, time =   6.288685 s\nAVX2 (MPSADBW)                          ... reference result = 8108076510, time =  13.151361 s\nAVX2 (generic)                          ... reference result = 8108076510, time =   4.094781 s\nAVX512F (MPSADBW-like)                  ... reference result = 8108076510, time =   2.327864 s\nAVX512F (generic)                       ... 
reference result = 8108076510, time =   1.142747 s\n./speedup_avx512 data/i386.txt data/words \nstd::strstr                             ... reference result = 8108076510, time =   4.949234 s\nSWAR 64-bit (generic)                   ... reference result = 8108076510, time =   8.170751 s\nSSE2 (generic)                          ... reference result = 8108076510, time =   6.109035 s\nSSE4.1 (MPSADBW)                        ... reference result = 8108076510, time =  18.716665 s\nSSE4.1 (MPSADBW unrolled)               ... reference result = 8108076510, time =  13.727568 s\nSSE4.2 (PCMPESTRM)                      ... reference result = 8108076510, time =   6.289994 s\nAVX2 (MPSADBW)                          ... reference result = 8108076510, time =  13.153943 s\nAVX2 (generic)                          ... reference result = 8108076510, time =   4.094941 s\nAVX512F (MPSADBW-like)                  ... reference result = 8108076510, time =   2.326156 s\nAVX512F (generic)                       ... reference result = 8108076510, time =   1.140567 s\n"
  },
  {
    "path": "results/postprocess.py",
    "content": "from collections import OrderedDict\n\ndef load(file):\n    D = OrderedDict()\n    for line in file:\n        if 'reference result' not in line:\n            continue\n\n        name, tail = line.split('...')\n        name = name.strip()\n        time = float(tail.split()[6])\n\n        if name not in D:\n            D[name] = time\n        else:\n            D[name] = min(time, D[name])\n\n    return D\n\n\ndef main():\n    import sys\n    paths = sys.argv[1:]\n    for path in paths:\n        if len(paths) > 1:\n            print path\n\n        with open(path, 'rt') as f:\n            for name, time in load(f).iteritems():\n                print '%-30s %10.5f' % (name, time)\n\n\nif __name__ == '__main__':\n    main()\n"
  },
  {
    "path": "results/skylake-i7-6700-gcc5.4.1-avx2.txt",
    "content": "./speedup_avx2 data/i386.txt data/words \nstd::strstr                             ... reference result = 8108076510, time =   0.662049 s\nSWAR 64-bit (generic)                   ... reference result = 8108076510, time =   1.404260 s\nSSE2 (generic)                          ... reference result = 8108076510, time =   0.489281 s\nSSE4.1 (MPSADBW)                        ... reference result = 8108076510, time =   0.638782 s\nSSE4.1 (MPSADBW unrolled)               ... reference result = 8108076510, time =   0.879433 s\nSSE4.2 (PCMPESTRM)                      ... reference result = 8108076510, time =   1.390802 s\nAVX2 (MPSADBW)                          ... reference result = 8108076510, time =   0.570455 s\nAVX2 (generic)                          ... reference result = 8108076510, time =   0.363694 s\n./speedup_avx2 data/i386.txt data/words \nstd::strstr                             ... reference result = 8108076510, time =   0.662266 s\nSWAR 64-bit (generic)                   ... reference result = 8108076510, time =   1.404036 s\nSSE2 (generic)                          ... reference result = 8108076510, time =   0.489313 s\nSSE4.1 (MPSADBW)                        ... reference result = 8108076510, time =   0.638926 s\nSSE4.1 (MPSADBW unrolled)               ... reference result = 8108076510, time =   0.879193 s\nSSE4.2 (PCMPESTRM)                      ... reference result = 8108076510, time =   1.390626 s\nAVX2 (MPSADBW)                          ... reference result = 8108076510, time =   0.569980 s\nAVX2 (generic)                          ... reference result = 8108076510, time =   0.363876 s\n./speedup_avx2 data/i386.txt data/words \nstd::strstr                             ... reference result = 8108076510, time =   0.661478 s\nSWAR 64-bit (generic)                   ... reference result = 8108076510, time =   1.405280 s\nSSE2 (generic)                          ... 
reference result = 8108076510, time =   0.488631 s\nSSE4.1 (MPSADBW)                        ... reference result = 8108076510, time =   0.638753 s\nSSE4.1 (MPSADBW unrolled)               ... reference result = 8108076510, time =   0.879345 s\nSSE4.2 (PCMPESTRM)                      ... reference result = 8108076510, time =   1.390670 s\nAVX2 (MPSADBW)                          ... reference result = 8108076510, time =   0.569808 s\nAVX2 (generic)                          ... reference result = 8108076510, time =   0.363091 s\n"
  },
  {
    "path": "results/skylake-i9-7900-gcc-5.4.1-avx512bw.txt",
    "content": "./speedup_avx512bw data/i386.txt data/words \nnaive scalar                            ... reference result = 8108076510, time =   4.872957 s\nstd::strstr                             ... reference result = 8108076510, time =   0.401080 s\nSWAR 64-bit (generic)                   ... reference result = 8108076510, time =   1.237922 s\nSWAR 32-bit (generic)                   ... reference result = 8108076510, time =   2.044511 s\nSSE2 (generic)                          ... reference result = 8108076510, time =   0.385573 s\nSSE4.1 (MPSADBW)                        ... reference result = 8108076510, time =   0.580510 s\nSSE4.1 (MPSADBW unrolled)               ... reference result = 8108076510, time =   0.674341 s\nSSE4.2 (PCMPESTRM)                      ... reference result = 8108076510, time =   1.058753 s\nSSE (naive)                             ... reference result = 8108076510, time =   1.709206 s\nAVX2 (MPSADBW)                          ... reference result = 8108076510, time =   0.444774 s\nAVX2 (generic)                          ... reference result = 8108076510, time =   0.274761 s\nAVX2 (naive)                            ... reference result = 8108076510, time =   0.918683 s\nAVX2 (naive unrolled)                   ... reference result = 8108076510, time =   0.463246 s\nAVX2-wide (naive)                       ... reference result = 8107771150, time =   0.441233 s\nAVX512F (MPSADBW-like)                  ... reference result = 8108076510, time =   0.507046 s\nAVX512F (generic)                       ... reference result = 8108076510, time =   0.262774 s\nAVX512BW (generic)                      ... reference result = 8108076510, time =   0.220457 s\n./speedup_avx512bw data/i386.txt data/words \nnaive scalar                            ... reference result = 8108076510, time =   4.816247 s\nstd::strstr                             ... reference result = 8108076510, time =   0.398468 s\nSWAR 64-bit (generic)                   ... 
reference result = 8108076510, time =   1.239442 s\nSWAR 32-bit (generic)                   ... reference result = 8108076510, time =   2.050195 s\nSSE2 (generic)                          ... reference result = 8108076510, time =   0.384561 s\nSSE4.1 (MPSADBW)                        ... reference result = 8108076510, time =   0.582862 s\nSSE4.1 (MPSADBW unrolled)               ... reference result = 8108076510, time =   0.675480 s\nSSE4.2 (PCMPESTRM)                      ... reference result = 8108076510, time =   1.061598 s\nSSE (naive)                             ... reference result = 8108076510, time =   1.676643 s\nAVX2 (MPSADBW)                          ... reference result = 8108076510, time =   0.439711 s\nAVX2 (generic)                          ... reference result = 8108076510, time =   1.638515 s\nAVX2 (naive)                            ... reference result = 8108076510, time =   0.984768 s\nAVX2 (naive unrolled)                   ... reference result = 8108076510, time =   0.494318 s\nAVX2-wide (naive)                       ... reference result = 8107771150, time =   0.479306 s\nAVX512F (MPSADBW-like)                  ... reference result = 8108076510, time =   0.553042 s\nAVX512F (generic)                       ... reference result = 8108076510, time =   0.290909 s\nAVX512BW (generic)                      ... reference result = 8108076510, time =   0.237055 s\n./speedup_avx512bw data/i386.txt data/words \nnaive scalar                            ... reference result = 8108076510, time =   6.406914 s\nstd::strstr                             ... reference result = 8108076510, time =   0.401352 s\nSWAR 64-bit (generic)                   ... reference result = 8108076510, time =   1.237499 s\nSWAR 32-bit (generic)                   ... reference result = 8108076510, time =   2.043457 s\nSSE2 (generic)                          ... reference result = 8108076510, time =   0.385167 s\nSSE4.1 (MPSADBW)                        ... 
reference result = 8108076510, time =   0.581361 s\nSSE4.1 (MPSADBW unrolled)               ... reference result = 8108076510, time =   0.675044 s\nSSE4.2 (PCMPESTRM)                      ... reference result = 8108076510, time =   1.059933 s\nSSE (naive)                             ... reference result = 8108076510, time =   1.671910 s\nAVX2 (MPSADBW)                          ... reference result = 8108076510, time =   0.444940 s\nAVX2 (generic)                          ... reference result = 8108076510, time =   0.276522 s\nAVX2 (naive)                            ... reference result = 8108076510, time =   0.921444 s\nAVX2 (naive unrolled)                   ... reference result = 8108076510, time =   0.464818 s\nAVX2-wide (naive)                       ... reference result = 8107771150, time =   0.442211 s\nAVX512F (MPSADBW-like)                  ... reference result = 8108076510, time =   0.511326 s\nAVX512F (generic)                       ... reference result = 8108076510, time =   0.265488 s\nAVX512BW (generic)                      ... reference result = 8108076510, time =   0.221329 s\n"
  },
  {
    "path": "results/westmere-m540-gcc6.2.0-sse4.txt",
    "content": "./speedup data/i386.txt data/words \nstd::strstr                             ... reference result = 8108076510, time =   0.832291 s\nSWAR 64-bit (generic)                   ... reference result = 8108076510, time =   2.498591 s\nSSE2 (generic)                          ... reference result = 8108076510, time =   0.745890 s\nSSE4.1 (MPSADBW)                        ... reference result = 8108076510, time =   1.450405 s\nSSE4.1 (MPSADBW unrolled)               ... reference result = 8108076510, time =   1.238676 s\nSSE4.2 (PCMPESTRM)                      ... reference result = 8108076510, time =   1.699681 s\n./speedup data/i386.txt data/words \nstd::strstr                             ... reference result = 8108076510, time =   0.822457 s\nSWAR 64-bit (generic)                   ... reference result = 8108076510, time =   2.518604 s\nSSE2 (generic)                          ... reference result = 8108076510, time =   0.750936 s\nSSE4.1 (MPSADBW)                        ... reference result = 8108076510, time =   1.470000 s\nSSE4.1 (MPSADBW unrolled)               ... reference result = 8108076510, time =   1.239929 s\nSSE4.2 (PCMPESTRM)                      ... reference result = 8108076510, time =   1.707411 s\n./speedup data/i386.txt data/words \nstd::strstr                             ... reference result = 8108076510, time =   0.827280 s\nSWAR 64-bit (generic)                   ... reference result = 8108076510, time =   2.535406 s\nSSE2 (generic)                          ... reference result = 8108076510, time =   0.747252 s\nSSE4.1 (MPSADBW)                        ... reference result = 8108076510, time =   1.456153 s\nSSE4.1 (MPSADBW unrolled)               ... reference result = 8108076510, time =   1.238485 s\nSSE4.2 (PCMPESTRM)                      ... reference result = 8108076510, time =   1.711734 s\n"
  },
  {
    "path": "scalar.cpp",
    "content": "// Implementation by Daniel Lemire\n// https://github.com/WojciechMula/sse4-strstr/issues/2\n\nsize_t strstr_naive(const char * hay, size_t size, const char *needle, size_t needlesize) {\n\n  if (size == needlesize) {\n    return memcmp(hay, needle, size) == 0 ? 0 : std::string::npos;\n  }\n\n  const char first = needle[0];\n  const ssize_t maxpos = ssize_t(size) - ssize_t(needlesize) + 1;\n  for(ssize_t i = 0; i < maxpos; i++) {\n    if(hay[i] != first) {\n       i++;\n       while( i < maxpos && hay[i] != first ) i++;\n       if ( i == maxpos ) break;\n    }\n    size_t j = 1;\n    for( ; j < needlesize; ++j)\n      if(hay[ i + j ] != needle[ j ] ) break;\n    if( j == needlesize) return i;\n  }\n  return std::string::npos;\n}\n"
  },
  {
    "path": "src/all.h",
    "content": "#pragma once\n\n#include \"common.h\"\n#include <utils/bits.cpp>\n#include <errno.h>\n#include \"fixed-memcmp.cpp\"\n#include \"scalar.cpp\"\n#include \"swar64-strstr-v2.cpp\"\n#include \"swar32-strstr-v2.cpp\"\n#ifdef HAVE_SSE_INSTRUCTIONS\n#   include <utils/sse.cpp>\n#   include \"sse4-strstr.cpp\"\n#   include \"sse4-strstr-unrolled.cpp\"\n#   include \"sse4.2-strstr.cpp\"\n#   include \"sse2-strstr.cpp\"\n#   include \"sse-naive-strstr.cpp\"\n#   include \"sse2-needle4.cpp\"\n#endif\n#ifdef HAVE_AVX2_INSTRUCTIONS\n#   include <utils/avx2.cpp>\n#   include \"avx2-strstr.cpp\"\n#   include \"avx2-strstr-v2.cpp\"\n#   include \"avx2-naive-strstr.cpp\"\n#   include \"avx2-naive-strstr64.cpp\"\n#   include \"avx2-naive-unrolled-strstr.cpp\"\n#endif\n#ifdef HAVE_AVX512F_INSTRUCTIONS\n#   include \"avx512f-strstr.cpp\"\n#   include \"avx512f-strstr-v2.cpp\"\n#endif\n#ifdef HAVE_AVX512BW_INSTRUCTIONS\n#   include \"avx512bw-strstr-v2.cpp\"\n#   include \"avx512bw-strstr-v3.cpp\"\n#endif\n#ifdef HAVE_NEON_INSTRUCTIONS\n#   include <utils/neon.cpp>\n#   include \"neon-strstr-v2.cpp\"\n#endif\n#ifdef HAVE_AARCH64_ARCHITECTURE\n#   include \"aarch64-strstr-v2.cpp\"\n#endif\n\n"
  },
  {
    "path": "src/all_procedures.cpp",
    "content": "#include \"all.h\"\n\n#include <string>\n#include <vector>\n#include <algorithm>\n#include <stdexcept>\n\nusing str_find_fun = size_t (*)(const char*, size_t, const char*, size_t);\n\nstruct Procedures {\n\n    struct Item {\n        str_find_fun proc;\n        std::string name;\n        char code;\n        bool builtin;\n\n        Item(str_find_fun proc_, const char* name_, char code_, bool builtin_ = false)\n            : proc(proc_)\n            , name(name_)\n            , code(code_)\n            , builtin(builtin_) {}\n    };\n\n    std::vector<Item> procedures;\n\n    const Item& operator[](char code) {\n        auto pred = [code](const Item& item){return item.code == code;};\n        auto it = std::find_if(procedures.begin(), procedures.end(), pred);\n\n        if (it == procedures.end()) {\n            throw std::logic_error(\"can't find procedure with code '\" + std::string(1, code) + \"'\");\n        }\n\n        return *it;\n    }\n};\n\nsize_t strstr_libc(const char* s, size_t, const char* needle, size_t) {\n    const char* ptr = strstr(s, needle);\n    if (ptr) {\n        return ptr - s;\n    } else {\n        return std::string::npos;\n    }\n}\n\nProcedures all_procedures() {\n\n    Procedures db;\n\n    db.procedures.emplace_back(\n        strstr_naive,\n        \"scalar (naive)\",\n        'a'\n    );\n\n    db.procedures.emplace_back(\n        strstr_libc,\n        \"std::strstr\",\n        'b',\n        true\n    );\n\n    db.procedures.emplace_back(\n        nullptr,\n        \"std::string::find\",\n        'c',\n        true\n    );\n\n#define REGISTER(code, name, procedure)                 \\\n    {                                                   \\\n        str_find_fun f = procedure;                     \\\n        db.procedures.emplace_back(f, name, code);      \\\n    }\n\n    REGISTER('d', \"SWAR 64-bit (generic)\", swar64_strstr_v2);\n    REGISTER('e', \"SWAR 32-bit (generic)\", swar32_strstr_v2);\n\n#ifdef 
HAVE_SSE_INSTRUCTIONS\n    REGISTER('f', \"SSE2 (generic)\", sse2_strstr_v2);\n    REGISTER('g', \"SSE4.1 (MPSADBW)\", sse4_strstr);\n    REGISTER('h', \"SSE4.1 (MPSADBW unrolled)\", sse4_strstr_unrolled);\n    REGISTER('i', \"SSE4.2 (PCMPESTRM)\", sse42_strstr);\n    REGISTER('j', \"SSE (naive)\", sse_naive_strstr);\n    REGISTER('v', \"SSE2 (4-byte needle)\", sse2_strstr_needle4);\n    REGISTER('w', \"SSE2 (4-byte needle v2)\", sse2_strstr_needle4_v2);\n#endif\n#ifdef HAVE_AVX2_INSTRUCTIONS\n    REGISTER('k', \"AVX2 (MPSADBW)\", avx2_strstr);\n    REGISTER('l', \"AVX2 (generic)\", avx2_strstr_v2);\n    REGISTER('m', \"AVX2 (naive)\", avx2_naive_strstr);\n    REGISTER('n', \"AVX2 (naive unrolled)\", avx2_naive_unrolled_strstr);\n    REGISTER('o', \"AVX2-wide (naive)\", avx2_naive_strstr64);\n#endif\n\n#ifdef HAVE_AVX512F_INSTRUCTIONS\n    REGISTER('p', \"AVX512F (MPSADBW-like)\", avx512f_strstr);\n    REGISTER('q', \"AVX512F (generic)\", avx512f_strstr_v2);\n#endif\n\n#ifdef HAVE_AVX512BW_INSTRUCTIONS\n    REGISTER('r', \"AVX512BW (generic)\", avx512bw_strstr_v2);\n    REGISTER('s', \"AVX512BW (masked)\", avx512bw_strstr_v3);\n#endif\n\n#ifdef HAVE_NEON_INSTRUCTIONS\n    REGISTER('t', \"ARM Neon 32 bit (v2)\", neon_strstr_v2);\n#endif\n\n#ifdef HAVE_AARCH64_ARCHITECTURE\n    REGISTER('u', \"AArch64 64 bit (v2)\", aarch64_strstr_v2);\n#endif\n\n#undef REGISTER\n    return db;\n}\n"
  },
  {
    "path": "src/application_base.cpp",
    "content": "class ApplicationBase {\n\nprotected:\n    std::string file;\n    std::vector<std::string> words;\n\npublic:\n    class Error final {\n    public:\n        const std::string message;\n\n    public:\n        Error(const std::string& msg) : message(msg) {}\n    };\n\npublic:\n    void prepare(const std::string& file_name, const std::string& words_name) {\n\n        load_text(file_name);\n        load_words(words_name);\n    }\n\nprivate:\n    void load_text(const std::string& path) {\n\n        FILE* f = fopen(path.c_str(), \"rt\");\n        if (f == nullptr) {\n            throw_errno(path);\n        }\n\n        fseek(f, -1, SEEK_END);\n        const auto size = ftell(f);\n\n        rewind(f);\n\n        char* buffer = new char[size];\n        fread(buffer, size, 1, f);\n        buffer[size] = 0;\n        fclose(f);\n\n        file = buffer;\n\n        delete[] buffer;\n    }\n\n\n    void load_words(const std::string& path) {\n\n        char buffer[1024];\n\n        FILE* f = fopen(path.c_str(), \"rt\");\n        if (f == nullptr) {\n            throw_errno(path);\n        }\n\n        while (!feof(f)) {\n            fgets(buffer, sizeof(buffer), f);\n\n            const auto len = strlen(buffer);\n            if (buffer[len - 1] == '\\n') {\n                buffer[len - 1] = 0;\n                if (len == 1) // skip empty strings\n                    continue;\n            }\n\n            words.push_back(buffer);\n        }\n\n        fclose(f);\n    }\n\n\n    void throw_errno(const std::string& prefix) {\n\n        const std::string msg = prefix + \": \" + std::string(strerror(errno));\n        throw Error(msg);\n    }\n};\n\n"
  },
  {
    "path": "src/benchmark.cpp",
    "content": "#include <cstdio>\n#include <cstdint>\n#include <cassert>\n#include <cstring>\n#include <string>\n#include <vector>\n\n#include \"all_procedures.cpp\"\n\n// ------------------------------------------------------------------------\n\n#include <utils/ansi.cpp>\n#include \"benchmark.h\"\n#include \"application_base.cpp\"\n\nclass Application final: public ApplicationBase {\n\n    Procedures db;\n\npublic:\n    enum class TestType {\n        OptimisticCase,\n        Random,\n        WorstCase\n    };\n\n    struct Parameters {\n        size_t needle_position;\n        size_t needle_size;\n        size_t count;\n        TestType test_type;\n        std::string procedure_codes;\n    };\n\npublic:\n    Application(const Parameters& params)\n        : db(all_procedures())\n        , parameters(params) {\n\n        prepare();\n    }\n\n    bool operator()() {\n\n        // strstr is treated as built-in function by GCC\n        // it seems it's wiped out in benchmark\n        const bool measure_stdstring  = false;\n\n#if defined(HAVE_NEON_INSTRUCTIONS) && !defined(HAVE_AARCH64_ARCHITECTURE)\n        // On Raspberry Pi it's terribly slow, but on Aarch64\n        // the 64-bit procedure is pretty fast\n        const bool measure_swar64     = false;\n#else\n        const bool measure_swar64     = true;\n#endif\n\n        if (is_enabled('a')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return strstr_naive(s.data(), s.size(), neddle.data(), neddle.size());\n            };\n\n            measure(find, 'a');\n        }\n\n        if (is_enabled('b')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n                const char* res = strstr(s.data(), neddle.data());\n\n                if (res != nullptr) {\n                    return res - s.data();\n                } else {\n                    return std::string::npos;\n                }\n            
};\n\n            measure(find, 'b');\n        }\n\n        if (measure_stdstring && is_enabled('c')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return s.find(neddle);\n            };\n\n            measure(find, 'c');\n        }\n\n        if (measure_swar64 && is_enabled('d')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return swar64_strstr_v2(s, neddle);\n            };\n\n            measure(find, 'd');\n        }\n\n        if (is_enabled('e')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return swar32_strstr_v2(s, neddle);\n            };\n\n            measure(find, 'e');\n        }\n\n#ifdef HAVE_SSE_INSTRUCTIONS\n        if (is_enabled('f')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return sse2_strstr_v2(s, neddle);\n            };\n\n            measure(find, 'f');\n        }\n\n        if (is_enabled('g')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return sse4_strstr(s, neddle);\n            };\n\n            measure(find, 'g');\n        }\n\n        if (is_enabled('h')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return sse4_strstr_unrolled(s, neddle);\n            };\n\n            measure(find, 'h');\n        }\n\n        if (is_enabled('i')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return sse42_strstr(s, neddle);\n            };\n\n            measure(find, 'i');\n        }\n\n        if (is_enabled('j')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return sse_naive_strstr(s, neddle);\n            };\n\n         
   measure(find, 'j');\n        }\n\n        if (is_enabled('v')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return sse2_strstr_needle4(s, neddle);\n            };\n\n            measure(find, 'v');\n        }\n\n        if (is_enabled('w')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return sse2_strstr_needle4_v2(s, neddle);\n            };\n\n            measure(find, 'w');\n        }\n#endif\n\n#ifdef HAVE_AVX2_INSTRUCTIONS\n        if (is_enabled('k')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return avx2_strstr(s, neddle);\n            };\n\n            measure(find, 'k');\n        }\n\n        if (is_enabled('l')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return avx2_strstr_v2(s, neddle);\n            };\n\n            measure(find, 'l');\n        }\n\n        if (is_enabled('m')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return avx2_naive_strstr(s, neddle);\n            };\n\n            measure(find, 'm');\n        }\n\n        if (is_enabled('n')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return avx2_naive_unrolled_strstr(s, neddle);\n            };\n\n            measure(find, 'n');\n        }\n\n        if (is_enabled('o')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return avx2_naive_strstr64(s, neddle);\n            };\n\n            measure(find, 'o');\n        }\n#endif\n\n#ifdef HAVE_AVX512F_INSTRUCTIONS\n        if (is_enabled('p')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return avx512f_strstr(s, 
neddle);\n            };\n\n            measure(find, 'p');\n        }\n\n        if (is_enabled('q')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return avx512f_strstr_v2(s, neddle);\n            };\n\n            measure(find, 'q');\n        }\n#endif\n\n#ifdef HAVE_AVX512BW_INSTRUCTIONS\n\tif (is_enabled('r')) {\n\n        auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n            return avx512bw_strstr_v2(s, neddle);\n        };\n\n        measure(find, 'r');\n    }\n\n\tif (is_enabled('u')) {\n\n        auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n            return avx512bw_strstr_v3(s, neddle);\n        };\n\n        measure(find, 'u');\n    }\n#endif\n\n#ifdef HAVE_NEON_INSTRUCTIONS\n        if (is_enabled('s')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return neon_strstr_v2(s, neddle);\n            };\n\n            measure(find, 's');\n        }\n#endif\n\n#ifdef HAVE_AARCH64_ARCHITECTURE\n        if (is_enabled('t')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return aarch64_strstr_v2(s, neddle);\n            };\n\n            measure(find, 't');\n        }\n#endif\n\n        return true;\n    }\n\n\n    static void print_help(const char* progname) {\n        std::printf(\"%s needle-position needle-size iteration-count test-name [procedures]\\n\", progname);\n        std::puts(\"\");\n        std::puts(\"Parameters:\");\n        std::puts(\"\");\n        std::puts(\"  needle-position  position of the needle\");\n        std::puts(\"  needle-size      length of the needle\");\n        std::puts(\"  count            how many times test is repeated\");\n        std::puts(\"  test-name        one of 'optimistic', 'random', 'worst'\");\n        std::puts(\"  procedures       procedure 
code(s), listed below [by default all will be tested]\");\n        std::puts(\"\");\n        std::puts(\"Test kinds\");\n        std::puts(\"\");\n        std::puts(\"  optimistic       data before needle contains characters not present in the needle\");\n        std::puts(\"  random           data before needle contains some random characters\");\n        std::puts(\"  worst            needle has form 'aaa...aaaXaaa...aaa', and data before is filled with the 'a'\");\n        std::puts(\"\");\n        std::puts(\"Following procedures are available:\");\n        for (auto& item: all_procedures().procedures) {\n            printf(\" [%c] %s\\n\", item.code, item.name.c_str());\n        }\n    }\n\n\nprivate:\n    volatile size_t sink;\n\n    template <typename T_FIND>\n    void measure(T_FIND find, char code) {\n\n        BEST_TIME(/**/,\n                  sink = find(input, needle),\n                  db[code].name.c_str(),\n                  parameters.count,\n                  parameters.needle_position);\n    }\n\n\n    bool is_enabled(char proc) const {\n        return (parameters.procedure_codes.empty())\n            || (parameters.procedure_codes.find(proc) != std::string::npos);\n    }\n\n    void prepare_needle() {\n\n        needle.append(parameters.needle_size/2, 'a');\n        needle.append(1, 'X');\n        needle.append(parameters.needle_size - needle.size(), 'a');\n    }\n\n    void prepare_input() {\n\n        const size_t padding = 256;\n\n        switch (parameters.test_type) {\n            case TestType::OptimisticCase:\n                input.assign(parameters.needle_position, '_');\n                break;\n\n            case TestType::WorstCase:\n                input.assign(parameters.needle_position, 'a');\n                break;\n\n            case TestType::Random:\n                for (size_t i=0; i < parameters.needle_position; i++) {\n                    const char c = rand() % ('z' - 'a' + 1) + 'a';\n                    
input.push_back(c);\n                }\n                break;\n        }\n\n        input += needle;\n        input.append(padding, '_'); // to make sure that memory after the needle is accessible\n    }\n\n    void prepare() {\n        prepare_needle();\n        prepare_input();\n    }\n\n    std::string needle;\n    std::string input;\n    Parameters parameters;\n};\n\n\nbool parse(int argc, char* argv[], Application::Parameters& p) {\n\n    if (argc < 5) {\n        return false;\n    }\n\n    for (int i=1; i < argc; i++) {\n        const std::string tmp = argv[i];\n        if (tmp == \"-h\" || tmp == \"--help\")\n            return false;\n    }\n\n    p.needle_position = atoi(argv[1]);\n    p.needle_size     = atoi(argv[2]);\n    p.count           = atoi(argv[3]);\n\n    if (p.needle_size < 3) {\n        throw std::runtime_error(\"needle size must be greater than 2\");\n    }\n\n    if (p.count == 0) {\n        throw std::runtime_error(\"count must be greater than 0\");\n    }\n\n    std::string tmp(argv[4]);\n    if (tmp == \"optimistic\") {\n        p.test_type = Application::TestType::OptimisticCase;\n    } else if (tmp == \"worst\") {\n        p.test_type = Application::TestType::WorstCase;\n    } else if (tmp == \"random\") {\n        p.test_type = Application::TestType::Random;\n    } else {\n        throw std::runtime_error(\"expected 'optimistic', 'worst' or 'random', got '\" + tmp + \"'\");\n    }\n\n    if (argc >= 6) {\n        p.procedure_codes = argv[5];\n    }\n\n    return true;\n}\n\n\nint main(int argc, char* argv[]) {\n\n    try {\n\n        Application::Parameters params;\n        if (!parse(argc, argv, params)) {\n            Application::print_help(argv[0]);\n            return EXIT_FAILURE;\n        }\n\n        Application app(params);\n        return app() ? 
EXIT_SUCCESS : EXIT_FAILURE;\n\n    } catch (std::runtime_error& err) {\n\n        const auto msg = ansi::seq(\"Error\", ansi::RED);\n        printf(\"%s: %s\\n\", msg.data(), err.what());\n\n        return EXIT_FAILURE;\n    } catch (ApplicationBase::Error& err) {\n\n        const auto msg = ansi::seq(\"Error\", ansi::RED);\n        printf(\"%s: %s\\n\", msg.data(), err.message.data());\n\n        return EXIT_FAILURE;\n    }\n}\n\n"
  },
  {
    "path": "src/benchmark.h",
    "content": "#ifndef _BENCHMARK_H_\n#define _BENCHMARK_H_\n\n#include <stdint.h>\n#define RDTSC_START(cycles)                                             \\\n    do {                                                                \\\n        uint32_t cyc_high, cyc_low;                                     \\\n        __asm volatile(\"cpuid\\n\"                                        \\\n                       \"rdtsc\\n\"                                        \\\n                       \"mov %%edx, %0\\n\"                                \\\n                       \"mov %%eax, %1\" :                                \\\n                       \"=r\" (cyc_high),                                 \\\n                       \"=r\"(cyc_low) :                                  \\\n                       : /* no read only */                             \\\n                       \"%rax\", \"%rbx\", \"%rcx\", \"%rdx\" /* clobbers */    \\\n                       );                                               \\\n        (cycles) = ((uint64_t)cyc_high << 32) | cyc_low;                \\\n    } while (0)\n\n#define RDTSC_STOP(cycles)                                              \\\n    do {                                                                \\\n        uint32_t cyc_high, cyc_low;                                     \\\n        __asm volatile(\"rdtscp\\n\"                                       \\\n                       \"mov %%edx, %0\\n\"                                \\\n                       \"mov %%eax, %1\\n\"                                \\\n                       \"cpuid\" :                                        \\\n                       \"=r\"(cyc_high),                                  \\\n                       \"=r\"(cyc_low) :                                  \\\n                       /* no read only registers */ :                   \\\n                       \"%rax\", \"%rbx\", \"%rcx\", \"%rdx\" /* clobbers */    \\\n                       ); 
                                              \\\n        (cycles) = ((uint64_t)cyc_high << 32) | cyc_low;                \\\n    } while (0)\n\nstatic __attribute__ ((noinline))\nuint64_t rdtsc_overhead_func(uint64_t dummy) {\n    return dummy;\n}\n\nuint64_t global_rdtsc_overhead = (uint64_t) UINT64_MAX;\n\n#define RDTSC_SET_OVERHEAD(test, repeat)                            \\\n  do {                                                              \\\n    uint64_t cycles_start, cycles_final, cycles_diff;               \\\n    uint64_t min_diff = UINT64_MAX;                                 \\\n    for (unsigned i = 0; i < repeat; i++) {                         \\\n      __asm volatile(\"\" ::: /* pretend to clobber */ \"memory\");     \\\n      RDTSC_START(cycles_start);                                    \\\n      test;                                                         \\\n      RDTSC_STOP(cycles_final);                                     \\\n      cycles_diff = (cycles_final - cycles_start);                  \\\n      if (cycles_diff < min_diff) min_diff = cycles_diff;           \\\n    }                                                               \\\n    global_rdtsc_overhead = min_diff;                               \\\n    printf(\"rdtsc_overhead set to %d\\n\", (int)global_rdtsc_overhead);     \\\n  } while (0)                                                       \\\n\n\n/*\n * Prints the best number of operations per cycle where\n * test is the function call, answer is the expected answer generated by\n * test, repeat is the number of times we should repeat and size is the\n * number of operations represented by test.\n */\n#define BEST_TIME(pre, test, test_name, repeat, size)                   \\\n    do {                                                                \\\n        if (global_rdtsc_overhead == UINT64_MAX) {                      \\\n           RDTSC_SET_OVERHEAD(rdtsc_overhead_func(1), repeat);          \\\n        }                    
                                           \\\n        printf(\"%-30s\\t: \", test_name); fflush(stdout);                 \\\n        uint64_t cycles_start, cycles_final, cycles_diff;               \\\n        uint64_t min_diff = (uint64_t)-1;                               \\\n        uint64_t sum_diff = 0;                                          \\\n        for (size_t i = 0; i < repeat; i++) {                           \\\n            pre;                                                        \\\n            __asm volatile(\"\" ::: /* pretend to clobber */ \"memory\");   \\\n            RDTSC_START(cycles_start);                                  \\\n            test;                                                       \\\n            RDTSC_STOP(cycles_final);                                   \\\n            cycles_diff = (cycles_final - cycles_start - global_rdtsc_overhead); \\\n            if (cycles_diff < min_diff) min_diff = cycles_diff;         \\\n            sum_diff += cycles_diff;                                    \\\n        }                                                               \\\n        uint64_t S = size;                                              \\\n        float cycle_per_op = (min_diff) / (double)S;                    \\\n        float avg_cycle_per_op = (sum_diff) / ((double)S * repeat);     \\\n        printf(\" %8.3f cycle/op (best) %8.3f cycle/op (avg)\\n\", cycle_per_op, avg_cycle_per_op); \\\n } while (0)\n\n#endif\n"
  },
  {
    "path": "src/speedup.cpp",
    "content": "#include <cstdio>\n#include <cstdint>\n#include <cassert>\n#include <cstring>\n#include <string>\n#include <vector>\n#include <chrono>\n\n#include \"all_procedures.cpp\"\n\n// ------------------------------------------------------------------------\n\n#include <utils/ansi.cpp>\n#include \"application_base.cpp\"\n\nclass Application final: public ApplicationBase {\n\n    Procedures db;\n    std::size_t count;\n    const std::string procedure_codes;\n\npublic:\n    struct Parameters {\n        std::string file_name;\n        std::string words_name;\n        size_t count = 10;\n        std::string procedure_codes;\n    };\n\npublic:\n    Application(const Parameters& params)\n        : db(all_procedures())\n        , count(params.count)\n        , procedure_codes(params.procedure_codes) {\n\n        prepare(params.file_name, params.words_name);\n    }\n\n    bool operator()() {\n\n#if defined(__GNUC__) && !defined(HAVE_NEON_INSTRUCTIONS)\n        // GNU std::string::find was proven to be utterly slow,\n        // don't waste our time on reconfirming that fact.\n        //\n        // (On Raspberry Pi it's fast, though)\n        const bool measure_stdstring  = false;\n#else\n        const bool measure_stdstring  = true;\n#endif\n#if defined(HAVE_NEON_INSTRUCTIONS) && !defined(HAVE_AARCH64_ARCHITECTURE)\n        // On Raspberry Pi it's terribly slow, but on Aarch64\n        // the 64-bit procedure is pretty fast\n        const bool measure_swar64     = false;\n#else\n        const bool measure_swar64     = true;\n#endif\n\n        if (is_enabled('a')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return strstr_naive(s.data(), s.size(), neddle.data(), neddle.size());\n            };\n\n            measure(find, 'a');\n        }\n\n        if (is_enabled('b')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n                const char* res = 
strstr(s.data(), neddle.data());\n\n                if (res != nullptr) {\n                    return res - s.data();\n                } else {\n                    return std::string::npos;\n                }\n            };\n\n            measure(find, 'b');\n        }\n\n        if (measure_stdstring && is_enabled('c')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return s.find(neddle);\n            };\n\n            measure(find, 'c');\n        }\n\n        if (measure_swar64 && is_enabled('d')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return swar64_strstr_v2(s, neddle);\n            };\n\n            measure(find, 'd');\n        }\n\n        if (is_enabled('e')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return swar32_strstr_v2(s, neddle);\n            };\n\n            measure(find, 'e');\n        }\n\n#ifdef HAVE_SSE_INSTRUCTIONS\n        if (is_enabled('f')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return sse2_strstr_v2(s, neddle);\n            };\n\n            measure(find, 'f');\n        }\n\n        if (is_enabled('g')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return sse4_strstr(s, neddle);\n            };\n\n            measure(find, 'g');\n        }\n\n        if (is_enabled('h')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return sse4_strstr_unrolled(s, neddle);\n            };\n\n            measure(find, 'h');\n        }\n\n        if (is_enabled('i')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return sse42_strstr(s, neddle);\n            };\n\n            measure(find, 
'i');\n        }\n\n        if (is_enabled('j')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return sse_naive_strstr(s, neddle);\n            };\n\n            measure(find, 'j');\n        }\n#endif\n\n#ifdef HAVE_AVX2_INSTRUCTIONS\n        if (is_enabled('k')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return avx2_strstr(s, neddle);\n            };\n\n            measure(find, 'k');\n        }\n\n        if (is_enabled('l')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return avx2_strstr_v2(s, neddle);\n            };\n\n            measure(find, 'l');\n        }\n\n        if (is_enabled('m')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return avx2_naive_strstr(s, neddle);\n            };\n\n            measure(find, 'm');\n        }\n\n        if (is_enabled('n')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return avx2_naive_unrolled_strstr(s, neddle);\n            };\n\n            measure(find, 'n');\n        }\n\n        if (is_enabled('o')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return avx2_naive_strstr64(s, neddle);\n            };\n\n            measure(find, 'o');\n        }\n#endif\n\n#ifdef HAVE_AVX512F_INSTRUCTIONS\n        if (is_enabled('p')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return avx512f_strstr(s, neddle);\n            };\n\n            measure(find, 'p');\n        }\n\n        if (is_enabled('q')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return avx512f_strstr_v2(s, neddle);\n            };\n\n   
         measure(find, 'q');\n        }\n#endif\n\n#ifdef HAVE_AVX512BW_INSTRUCTIONS\n\tif (is_enabled('r')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return avx512bw_strstr_v2(s, neddle);\n            };\n\n            measure(find, 'r');\n        }\n#endif\n\n#ifdef HAVE_NEON_INSTRUCTIONS\n        if (is_enabled('s')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return neon_strstr_v2(s, neddle);\n            };\n\n            measure(find, 's');\n        }\n#endif\n\n#ifdef HAVE_AARCH64_ARCHITECTURE\n        if (is_enabled('t')) {\n\n            auto find = [](const std::string& s, const std::string& neddle) -> size_t {\n\n                return aarch64_strstr_v2(s, neddle);\n            };\n\n            measure(find, 't');\n        }\n#endif\n\n        return true;\n    }\n\n\n    static void print_help(const char* progname) {\n        std::printf(\"%s file words [count] [procedure]\\n\", progname);\n        std::puts(\"\");\n        std::puts(\"Parameters:\");\n        std::puts(\"\");\n        std::puts(\"  file      - arbitrary file\");\n        std::puts(\"  words     - list of words in separate lines\");\n        std::puts(\"  count     - repeat count (optional, default = 10)\");\n        std::puts(\"  procedure - letter(s) from square brackets (by default all functions are checked)\");\n        std::puts(\"\");\n        std::puts(\"Following procedures are available:\");\n        for (auto& item: all_procedures().procedures) {\n            printf(\" [%c] %s\\n\", item.code, item.name.c_str());\n        }\n    }\n\n\nprivate:\n    template <typename T_FIND>\n    void measure(T_FIND find, char code) {\n\n        printf(\"%-40s... 
\", db[code].name.c_str());\n        fflush(stdout);\n\n        size_t result = 0;\n\n        const auto t1 = std::chrono::high_resolution_clock::now();\n\n        auto k = count;\n        while (k != 0) {\n            for (const auto& word: words) {\n                result += find(file, word);\n            }\n\n            k--;\n        }\n\n        const auto t2 = std::chrono::high_resolution_clock::now();\n        const std::chrono::duration<double> td = t2-t1;\n\n        printf(\"reference result = %lu, time = %10.6f s\\n\", result, td.count());\n    }\n\n\n    bool is_enabled(char proc) const {\n        return (procedure_codes.empty())\n            || (procedure_codes.find(proc) != std::string::npos);\n    }\n};\n\n\nbool parse(int argc, char* argv[], Application::Parameters& p) {\n    if (argc < 3) {\n        return false;\n    }\n\n    for (int i=1; i < argc; i++) {\n        const std::string tmp = argv[i];\n        if (tmp == \"-h\" || tmp == \"--help\")\n            return false;\n    }\n\n    p.file_name = argv[1];\n    p.words_name = argv[2];\n\n    if (argc >= 4) {\n        size_t tmp = atoi(argv[3]);\n        if (tmp > 0) {\n            p.count = tmp;\n        } else {\n            printf(\"repeat count '%s' invalid, keeping default %lu\\n\", argv[3], p.count);\n        }\n    }\n\n    if (argc >= 5) {\n        p.procedure_codes = argv[4];\n    }\n\n    return true;\n}\n\n\nint main(int argc, char* argv[]) {\n\n    try {\n\n        Application::Parameters params;\n        if (!parse(argc, argv, params)) {\n            Application::print_help(argv[0]);\n            return EXIT_FAILURE;\n        }\n\n        Application app(params);\n        return app() ? EXIT_SUCCESS : EXIT_FAILURE;\n\n    } catch (ApplicationBase::Error& err) {\n\n        const auto msg = ansi::seq(\"Error: \", ansi::RED);\n        printf(\"%s: %s\\n\", msg.data(), err.message.data());\n\n        return EXIT_FAILURE;\n    }\n}\n"
  },
  {
    "path": "src/unittests.cpp",
    "content": "#include <cstdio>\n#include <cstdint>\n#include <cassert>\n#include <cstring>\n#include <string>\n#include <vector>\n\n#include \"all.h\"\n\n#include <utils/ansi.cpp>\n#include \"all_procedures.cpp\"\n\nbool test(const char* name, str_find_fun strstr_function) {\n\n    std::printf(\"%s... \", name);\n    std::fflush(stdout);\n\n    for (size_t size = 1; size < 64; size++) {\n\n        const std::string neddle = \"$\" + std::string(size, 'x') + \"#\";\n\n        for (size_t n = 0; n < 3*16; n++) {\n\n            const std::string prefix(n, '.');\n\n            for (size_t k = 0; k < 3*16; k++) {\n                // '.' * k + '$' + 'x' * size + '#' + '.' * k\n\n                const std::string suffix(k, '.');\n                const std::string str = prefix + neddle + suffix;\n\n                const auto result = strstr_function(str.data(), str.size(), neddle.data(), neddle.size());\n\n                if (result != n) {\n                    printf(\"%s\\n\", ansi::seq(\"FAILED\", ansi::RED).c_str());\n\n                    printf(\"   string = '%s' (length %lu)\\n\", str.data(), str.size());\n                    printf(\"   neddle = '%s' (length %lu)\\n\", neddle.data(), neddle.size());\n                    printf(\"   expected result = %lu, actual result = %lu\\n\", n, result);\n\n                    return false;\n                }\n            }\n        }\n    }\n\n    const auto msg = ansi::seq(\"OK\", ansi::GREEN);\n    printf(\"%s\\n\", msg.c_str());\n\n    return true;\n}\n\n\nint main() {\n\n    int ret = EXIT_SUCCESS;\n\n    puts(\"running unit tests\");\n\n    auto db = all_procedures();\n    for (auto& item: db.procedures) {\n        if (item.builtin) {\n            continue;\n        }\n\n        if (!test(item.name.c_str(), item.proc)) {\n            ret = EXIT_FAILURE;\n        }\n    }\n\n    return ret;\n}\n\n"
  },
  {
    "path": "src/validate.cpp",
    "content": "#include <cstdio>\n#include <cstdint>\n#include <cassert>\n#include <cstring>\n#include <string>\n#include <vector>\n\n// ------------------------------------------------------------------------\n\n#include \"all_procedures.cpp\"\n\n// ------------------------------------------------------------------------\n\n#include <utils/ansi.cpp>\n#include \"application_base.cpp\"\n\n\nclass Application final: public ApplicationBase {\n\npublic:\n    Application(const std::string& file_name, const std::string& words_name) {\n        prepare(file_name, words_name);\n    }\n\n    bool run() {\n        const auto n = words.size();\n\n        auto db = all_procedures();\n\n        for (size_t i = 0; i < n; i++) {\n\n            if (i % 100 == 0) {\n                print_progress(i, n);\n            }\n\n            const auto& word = words[i];\n            const size_t reference = file.find(word);\n\n            for (auto& item: db.procedures) {\n                if (item.builtin) {\n                    continue;\n                }\n                \n                const size_t result = item.proc(file.data(), file.size(), word.data(), word.size());\n                if (reference != result) {\n                    putchar('\\n');\n                    const auto msg = ansi::seq(\"ERROR\", ansi::RED);\n                    printf(\"%s: std::find result = %lu, %s = %lu\\n\",\n                        msg.data(), reference, item.name.c_str(), result);\n\n                    printf(\"word: '%s' (length %lu)\\n\", word.data(), word.size());\n\n                    return false;\n                }\n            }\n        }\n\n        print_progress(n, n);\n        putchar('\\n');\n\n        const auto msg = ansi::seq(\"OK\", ansi::GREEN);\n        printf(\"%s\\n\", msg.c_str());\n\n        return true;\n    }\n\n\n    static void print_help(const char* progname) {\n        std::printf(\"usage: %s [file] [words]\\n\", progname);\n        std::puts(\"\");\n        
std::puts(\"Search all words in a file using std::string::find and SSE4 procedure\");\n        std::puts(\"\");\n        std::puts(\"Parameters:\");\n        std::puts(\"\");\n        std::puts(\"  file  - arbitrary file\");\n        std::puts(\"  words - list of words in separate lines\");\n    }\n\nprivate:\n    void print_progress(size_t pos, size_t n) {\n\n        printf(\"validating... %0.2f%% (%lu/%lu)\\r\", 100.0*pos/n, pos, n);\n        fflush(stdout);\n    }\n};\n\n\nint main(int argc, char* argv[]) {\n\n\n    if (argc == 3) {\n        try {\n            Application app(argv[1], argv[2]);\n\n            const auto ret = app.run();\n\n            return ret ? EXIT_SUCCESS : EXIT_FAILURE;\n        } catch (ApplicationBase::Error& err) {\n\n            const auto msg = ansi::seq(\"Error: \", ansi::RED);\n            printf(\"%s: %s\\n\", msg.data(), err.message.data());\n\n            return EXIT_FAILURE;\n        }\n    } else {\n        Application::print_help(argv[0]);\n\n        return EXIT_FAILURE;\n    }\n}\n"
  },
  {
    "path": "sse-naive-strstr.cpp",
    "content": "// Method described in https://arxiv.org/pdf/1612.01506.pdf\n//\n// Implementation by Daniel Lemire\n// https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/blob/master/extra/simd/substring/substring.c\n\nsize_t FORCE_INLINE sse_naive_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) {\n\n    assert(k > 0);\n    assert(n > 0);\n\n    if (n == k) {\n        return (memcmp(s, needle, k) == 0) ? 0 : std::string::npos;\n    }\n\n    for (size_t i = 0; i < n - k + 1; i += 16) {\n        uint16_t found = 0xffff;\n        for (size_t j = 0; (j < k) && (found != 0) ; ++j) {\n            const __m128i textvector = _mm_loadu_si128((const __m128i *)(s + i + j));\n            const __m128i needlevector = _mm_set1_epi8(needle[j]);\n            uint16_t bitmask = _mm_movemask_epi8(_mm_cmpeq_epi8(textvector, needlevector));\n            found = found & bitmask;\n        }\n        if (found != 0) {\n            return i + __builtin_ctz(found);\n        }\n    }\n\n    return std::string::npos;\n}\n\n\n// ------------------------------------------------------------------------\n\nsize_t sse_naive_strstr(const char* s, size_t n, const char* needle, size_t k) {\n\n    size_t result = std::string::npos;\n\n    if (n < k) {\n        return result;\n    }\n\n\tresult = sse_naive_strstr_anysize(s, n, needle, k);\n\n    if (result <= n - k) {\n        return result;\n    } else {\n        return std::string::npos;\n    }\n}\n\n// ------------------------------------------------------------------------\n\nsize_t sse_naive_strstr(const std::string& s, const std::string& needle) {\n\n    return sse_naive_strstr(s.data(), s.size(), needle.data(), needle.size());\n}\n\n\n"
  },
  {
    "path": "sse2-needle4.cpp",
    "content": "size_t FORCE_INLINE sse2_needle4(const char* s, size_t n, const char* needle, size_t k) {\n\n    uint32_t u32;\n    memcpy(&u32, needle, sizeof(u32));\n\n    const __m128i v_needle = _mm_set1_epi32(u32);\n    const __m128i shuffle  = _mm_setr_epi8(0, 1, 2, 3,\n                                           1, 2, 3, 4,\n                                           2, 3, 4, 5,\n                                           3, 4, 5, 6);\n\n    for (size_t i = 0; i < n - k + 1; i += 4) {\n        // 1. load 7 bytes:\n        // [abcd|efg?|????|????]\n        uint64_t u64;\n        memcpy(&u64, &s[i], sizeof(u64));\n        const __m128i t0 = _mm_cvtsi64x_si128(u64);\n\n        // 2. make all possible 4-byte substrings\n        // [abcd|bcde|cdef|defg]\n        const __m128i t1 = _mm_shuffle_epi8(shuffle, t0);\n\n        // 3. compare the 4-byte substrings with the needle\n        const __m128i t2 = _mm_cmpeq_epi32(v_needle, t1);\n\n        const int mask = _mm_movemask_ps((__m128)t2);\n        if (mask != 0) {\n            return i + __builtin_clz(mask);\n        }\n    }\n\n    return std::string::npos;\n}\n\n\n// ------------------------------------------------------------------------\n\nsize_t sse2_strstr_needle4(const char* s, size_t n, const char* needle, size_t k) {\n\n    if (k != 4) {\n        return std::string::npos;\n    }\n\n    return sse2_needle4(s, n, needle, k);\n}\n\n// ------------------------------------------------------------------------\n\nsize_t sse2_strstr_needle4(const std::string& s, const std::string& needle) {\n\n    return sse2_strstr_needle4(s.data(), s.size(), needle.data(), needle.size());\n}\n\n\nsize_t FORCE_INLINE sse2_needle4_v2(const char* s, size_t n, const char* needle, size_t k) {\n\n    uint32_t u32;\n    memcpy(&u32, needle, sizeof(u32));\n\n    const __m128i v_needle = _mm_set1_epi32(u32);\n    const __m128i shuffle0 = _mm_setr_epi8(0, 1, 2, 3,\n                                           1, 2, 3, 4,\n                    
                       2, 3, 4, 5,\n                                           3, 4, 5, 6);\n    const __m128i shuffle1 = _mm_setr_epi8(4, 5,  6,  7,\n                                           5, 6,  7,  8,\n                                           6, 7,  8,  9,\n                                           7, 8,  9, 10);\n\n    for (size_t i = 0; i < n - k + 1; i += 8) {\n        // 1. load 15 bytes:\n        // [abcd|efgh|ijkl|????]\n        const __m128i input = _mm_loadu_si128((const __m128i*)(s + i));\n\n        // 2a. make all possible 4-byte substrings\n        //     lo = [abcd|bcde|cdef|defg]\n        const __m128i lo = _mm_shuffle_epi8(shuffle0, input);\n\n        //     hi = [efgh|fghi|ghij|hijk]\n        const __m128i hi = _mm_shuffle_epi8(shuffle1, input);\n\n        // 3. compare the 4-byte substrings with the needle\n        const __m128i eq_lo = _mm_cmpeq_epi32(v_needle, lo);\n        const __m128i eq_hi = _mm_cmpeq_epi32(v_needle, hi);\n\n        // to perform single movemask in the main loop\n        const __m128i t0 = _mm_or_si128(eq_lo, eq_hi);\n\n        const int mask = _mm_movemask_ps((__m128)t0);\n        if (mask != 0) {\n            const int mask_lo = _mm_movemask_ps((__m128)eq_lo);\n            if (mask_lo != 0) {\n                return i + __builtin_clz(mask_lo);\n            } else {\n                return i + 4 + __builtin_clz(mask);\n            }\n        }\n    }\n\n    return std::string::npos;\n}\n\n\n// ------------------------------------------------------------------------\n\nsize_t sse2_strstr_needle4_v2(const char* s, size_t n, const char* needle, size_t k) {\n\n    if (k != 4) {\n        return std::string::npos;\n    }\n\n    return sse2_needle4_v2(s, n, needle, k);\n}\n\n// ------------------------------------------------------------------------\n\nsize_t sse2_strstr_needle4_v2(const std::string& s, const std::string& needle) {\n\n    return sse2_strstr_needle4_v2(s.data(), s.size(), needle.data(), 
needle.size());\n}\n\n\n"
  },
  {
    "path": "sse2-strstr.cpp",
    "content": "// implements scheme described in http://0x80.pl/articles/simd-friendly-karp-rabin.html\n\nsize_t FORCE_INLINE sse2_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) {\n\n    assert(k > 0);\n    assert(n > 0);\n\n    const __m128i first = _mm_set1_epi8(needle[0]);\n    const __m128i last  = _mm_set1_epi8(needle[k - 1]);\n\n    for (size_t i = 0; i < n; i += 16) {\n\n        const __m128i block_first = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));\n        const __m128i block_last  = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i + k - 1));\n\n        const __m128i eq_first = _mm_cmpeq_epi8(first, block_first);\n        const __m128i eq_last  = _mm_cmpeq_epi8(last, block_last);\n\n        uint16_t mask = _mm_movemask_epi8(_mm_and_si128(eq_first, eq_last));\n\n        while (mask != 0) {\n\n            const auto bitpos = bits::get_first_bit_set(mask);\n\n            if (memcmp(s + i + bitpos + 1, needle + 1, k - 2) == 0) {\n                return i + bitpos;\n            }\n\n            mask = bits::clear_leftmost_set(mask);\n        }\n    }\n\n    return std::string::npos;\n}\n\n// ------------------------------------------------------------------------\n\ntemplate <size_t k, typename MEMCMP>\nsize_t FORCE_INLINE sse2_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) {\n\n    assert(k > 0);\n    assert(n > 0);\n\n    const __m128i first = _mm_set1_epi8(needle[0]);\n    const __m128i last  = _mm_set1_epi8(needle[k - 1]);\n\n    for (size_t i = 0; i < n; i += 16) {\n\n        const __m128i block_first = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));\n        const __m128i block_last  = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i + k - 1));\n\n        const __m128i eq_first = _mm_cmpeq_epi8(first, block_first);\n        const __m128i eq_last  = _mm_cmpeq_epi8(last, block_last);\n\n        uint32_t mask = _mm_movemask_epi8(_mm_and_si128(eq_first, eq_last));\n\n   
     while (mask != 0) {\n\n            const auto bitpos = bits::get_first_bit_set(mask);\n\n            if (memcmp_fun(s + i + bitpos + 1, needle + 1)) {\n                return i + bitpos;\n            }\n\n            mask = bits::clear_leftmost_set(mask);\n        }\n    }\n\n    return std::string::npos;\n}\n\n// ------------------------------------------------------------------------\n\nsize_t sse2_strstr_v2(const char* s, size_t n, const char* needle, size_t k) {\n\n    size_t result = std::string::npos;\n\n    if (n < k) {\n        return result;\n    }\n\n\tswitch (k) {\n\t\tcase 0:\n\t\t\treturn 0;\n\n\t\tcase 1: {\n            const char* res = reinterpret_cast<const char*>(strchr(s, needle[0]));\n\n\t\t\treturn (res != nullptr) ? res - s : std::string::npos;\n            }\n\n        case 2:\n            result = sse2_strstr_memcmp<2>(s, n, needle, always_true);\n            break;\n\n        case 3:\n            result = sse2_strstr_memcmp<3>(s, n, needle, memcmp1);\n            break;\n\n        case 4:\n            result = sse2_strstr_memcmp<4>(s, n, needle, memcmp2);\n            break;\n\n        case 5:\n            result = sse2_strstr_memcmp<5>(s, n, needle, memcmp4);\n            break;\n\n        case 6:\n            result = sse2_strstr_memcmp<6>(s, n, needle, memcmp4);\n            break;\n\n        case 7:\n            result = sse2_strstr_memcmp<7>(s, n, needle, memcmp5);\n            break;\n\n        case 8:\n            result = sse2_strstr_memcmp<8>(s, n, needle, memcmp6);\n            break;\n\n        case 9:\n            result = sse2_strstr_memcmp<9>(s, n, needle, memcmp8);\n            break;\n\n        case 10:\n            result = sse2_strstr_memcmp<10>(s, n, needle, memcmp8);\n            break;\n\n        case 11:\n            result = sse2_strstr_memcmp<11>(s, n, needle, memcmp9);\n            break;\n\n        case 12:\n            result = sse2_strstr_memcmp<12>(s, n, needle, memcmp10);\n            
break;\n\n\t\tdefault:\n\t\t\tresult = sse2_strstr_anysize(s, n, needle, k);\n            break;\n    }\n\n    if (result <= n - k) {\n        return result;\n    } else {\n        return std::string::npos;\n    }\n}\n\n// ------------------------------------------------------------------------\n\nsize_t sse2_strstr_v2(const std::string& s, const std::string& needle) {\n\n    return sse2_strstr_v2(s.data(), s.size(), needle.data(), needle.size());\n}\n\n\n"
  },
  {
    "path": "sse4-strstr-unrolled.cpp",
    "content": "// Note: it appears that these specialized functions do not help.\n//       But I decided to leave them, just in case.\n\n// use functions/templates dealing with certain substring length\n//#define ENABLE_SSE4_LENGTH_SPECIALIZATIONS\n\n// When defined use sse4_strstr_unrolled_memcmp template,\n// otherwise use just sse4_strstr_unrolled_max20 and sse4_strstr_unrolled_max36\n//#define ENABLE_SSE4_MEMCMP_TEMPLATES\n\nsize_t sse4_strstr_unrolled_anysize(const char* s, size_t n, const char* needle, size_t needle_size) {\n\n    assert(needle_size > 4);\n    assert(n > 0);\n\n    const __m128i prefix = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needle));\n    const __m128i zeros  = _mm_setzero_si128();\n\n    __m128i prev = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s));\n    __m128i curr;\n\n    for (size_t i = 0; i < n; i += 16) {\n\n        curr  = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i + 16));\n\n        const __m128i data0   = prev;\n        const __m128i data1   = _mm_alignr_epi8(curr, prev, 8);\n        const __m128i result0 = _mm_mpsadbw_epu8(data0, prefix, 0);\n        const __m128i result1 = _mm_mpsadbw_epu8(data1, prefix, 0);\n        prev = curr;\n\n        const __m128i result  = _mm_packus_epi16(result0, result1);\n        const __m128i cmp     = _mm_cmpeq_epi8(result, zeros);\n\n        unsigned mask = _mm_movemask_epi8(cmp);\n\n        while (mask != 0) {\n\n            const auto bitpos = bits::get_first_bit_set(mask);\n\n            if (memcmp(s + i + bitpos + 4, needle + 4, needle_size - 4) == 0) {\n                return i + bitpos;\n            }\n\n            mask = bits::clear_leftmost_set(mask);\n        }\n    }\n\n    return std::string::npos;\n}\n\n// ------------------------------------------------------------------------\n\ntemplate <size_t k, typename MEMCMP>\nsize_t sse4_strstr_unrolled_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) {\n\n    assert(k > 4);\n    assert(n > 
0);\n\n    const __m128i prefix = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needle));\n    const __m128i zeros  = _mm_setzero_si128();\n\n    __m128i prev = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s));\n    __m128i curr;\n\n    for (size_t i = 0; i < n; i += 16) {\n\n        curr  = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i + 16));\n\n        const __m128i data0   = prev;\n        const __m128i data1   = _mm_alignr_epi8(curr, prev, 8);\n        const __m128i result0 = _mm_mpsadbw_epu8(data0, prefix, 0);\n        const __m128i result1 = _mm_mpsadbw_epu8(data1, prefix, 0);\n        prev = curr;\n\n        const __m128i result  = _mm_packus_epi16(result0, result1);\n        const __m128i cmp     = _mm_cmpeq_epi8(result, zeros);\n\n        unsigned mask = _mm_movemask_epi8(cmp);\n\n        while (mask != 0) {\n\n            const auto bitpos = bits::get_first_bit_set(mask);\n\n            if (memcmp_fun(s + i + bitpos + 4, needle + 4)) {\n                return i + bitpos;\n            }\n\n            mask = bits::clear_leftmost_set(mask);\n        }\n    }\n\n    return std::string::npos;\n}\n\n// ------------------------------------------------------------------------\n\nsize_t sse4_strstr_unrolled_max20(const char* s, size_t n, const char* needle, size_t needle_size) {\n\n    const __m128i zeros  = _mm_setzero_si128();\n    const __m128i prefix = sse::load(needle);\n    const __m128i suffix = sse::load(needle + 4);\n    const __m128i suff_mask = sse::mask_lower_bytes(needle_size - 4);\n\n    for (size_t i = 0; i < n; i += 8) {\n\n        const __m128i data   = sse::load(s + i);\n        const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);\n\n        const __m128i cmp    = _mm_cmpeq_epi16(result, zeros);\n\n        unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;\n\n        while (mask != 0) {\n\n            const auto bitpos = bits::get_first_bit_set(mask)/2;\n\n            const __m128i str = sse::load(s + i + bitpos + 4);\n       
     const __m128i cmp = _mm_cmpeq_epi8(str, suffix);\n\n            if (_mm_testc_si128(cmp, suff_mask)) {\n\n                return i + bitpos;\n            }\n\n            mask = bits::clear_leftmost_set(mask);\n        }\n    }\n\n    return std::string::npos;\n}\n\n// ------------------------------------------------------------------------\n\nsize_t sse4_strstr_unrolled_max36(const char* s, size_t n, const char* needle, size_t needle_size) {\n\n    const __m128i zeros     = _mm_setzero_si128();\n    const __m128i prefix    = sse::load(needle);\n    const __m128i suffix1   = sse::load(needle + 4);\n    const __m128i suffix2   = sse::load(needle + 16 + 4);\n    const __m128i suff_mask = sse::mask_higher_bytes(needle_size - (16 + 4));\n\n    for (size_t i = 0; i < n; i += 8) {\n\n        const __m128i data   = sse::load(s + i);\n        const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);\n\n        const __m128i cmp    = _mm_cmpeq_epi16(result, zeros);\n\n        unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;\n\n        while (mask != 0) {\n\n            const auto bitpos = bits::get_first_bit_set(mask)/2;\n\n            const __m128i c1 = _mm_cmpeq_epi8(sse::load(s + i + bitpos + 4), suffix1);\n            const __m128i c2 = _mm_cmpeq_epi8(sse::load(s + i + bitpos + 16 + 4), suffix2);\n\n            const __m128i c3 = _mm_or_si128(c2, suff_mask);\n            const __m128i tmp = _mm_and_si128(c1, c3);\n\n            if (_mm_movemask_epi8(tmp) == 0xffff) {\n\n                return i + bitpos;\n            }\n\n            mask = bits::clear_leftmost_set(mask);\n        }\n    }\n\n    return std::string::npos;\n}\n\n// ------------------------------------------------------------------------\n\nsize_t sse4_strstr_unrolled_len3(const char* s, size_t n, const char* needle) {\n\n    const __m128i prefix = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needle));\n    const __m128i zeros  = _mm_setzero_si128();\n\n    for (size_t i = 0; i < n; i += 8) {\n\n 
       const __m128i data     = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));\n        const __m128i lastbyte = _mm_cvtepu8_epi16(_mm_srli_si128(data, 3));\n        const __m128i result   = _mm_mpsadbw_epu8(data, prefix, 0);\n\n        const __m128i cmp    = _mm_cmpeq_epi16(_mm_sub_epi16(result, lastbyte), zeros);\n\n        unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;\n\n        if (mask != 0) {\n\n            return i + bits::get_first_bit_set(mask)/2;\n        }\n    }\n\n    return std::string::npos;\n}\n\n// ------------------------------------------------------------------------\n\nsize_t sse4_strstr_unrolled_len4(const char* s, size_t n, const char* needle) {\n\n    const __m128i prefix = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needle));\n    const __m128i zeros  = _mm_setzero_si128();\n\n    for (size_t i = 0; i < n; i += 8) {\n\n        const __m128i data   = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));\n        const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);\n\n        const __m128i cmp    = _mm_cmpeq_epi16(result, zeros);\n\n        unsigned mask = _mm_movemask_epi8(cmp);\n\n        if (mask != 0) {\n\n            return i + bits::get_first_bit_set(mask)/2;\n        }\n    }\n\n    return std::string::npos;\n}\n\n// ------------------------------------------------------------------------\n\nsize_t sse4_strstr_unrolled(const char* s, size_t n, const char* needle, size_t needle_size) {\n\n    size_t result = std::string::npos;\n\n    if (n < needle_size) {\n        return result;\n    }\n\n\tswitch (needle_size) {\n\t\tcase 0:\n\t\t\treturn 0;\n\n\t\tcase 1: {\n            const char* res = reinterpret_cast<const char*>(strchr(s, needle[0]));\n\n\t\t\treturn (res != nullptr) ? res - s : std::string::npos;\n            }\n\n\t\tcase 2: {\n\t\t\tconst char* res = reinterpret_cast<const char*>(strstr(s, needle));\n\n\t\t\treturn (res != nullptr) ? 
res - s : std::string::npos;\n            }\n\t\tcase 3:\n\n\t\t\tresult = sse4_strstr_unrolled_len3(s, n, needle);\n            break;\n\n\t\tcase 4:\n\t\t\tresult = sse4_strstr_unrolled_len4(s, n, needle);\n            break;\n\n#ifdef ENABLE_SSE4_LENGTH_SPECIALIZATIONS\n#ifdef ENABLE_SSE4_MEMCMP_TEMPLATES\n\t\tcase 5:\n            result = sse4_strstr_unrolled_memcmp<5>(s, n, needle, memcmp1);\n            break;\n\n\t\tcase 6:\n            result = sse4_strstr_unrolled_memcmp<6>(s, n, needle, memcmp2);\n            break;\n\n\t\tcase 7:\n            result = sse4_strstr_unrolled_memcmp<7>(s, n, needle, memcmp3);\n            break;\n\n\t\tcase 8:\n            result = sse4_strstr_unrolled_memcmp<8>(s, n, needle, memcmp4);\n            break;\n\n\t\tcase 9:\n            result = sse4_strstr_unrolled_memcmp<9>(s, n, needle, memcmp5);\n            break;\n\n\t\tcase 10:\n            result = sse4_strstr_unrolled_memcmp<10>(s, n, needle, memcmp6);\n            break;\n\n\t\tcase 11:\n            result = sse4_strstr_unrolled_memcmp<11>(s, n, needle, memcmp7);\n            break;\n\n\t\tcase 12:\n            result = sse4_strstr_unrolled_memcmp<12>(s, n, needle, memcmp8);\n            break;\n\n\t\tcase 13:\n            result = sse4_strstr_unrolled_memcmp<13>(s, n, needle, memcmp9);\n            break;\n\n\t\tcase 14:\n            result = sse4_strstr_unrolled_memcmp<14>(s, n, needle, memcmp10);\n            break;\n#else\n        case 5: case 6: case 7: case 8:\n        case 9: case 10: case 11: case 12:\n        case 13: case 14: /* 5 .. 
14 */\n#endif // ENABLE_SSE4_MEMCMP_TEMPLATES\n\t\tcase 15: case 16: case 17: case 18: case 19:\n\t\tcase 20: /* 15..20 */\n\t\t    result = sse4_strstr_unrolled_max20(s, n, needle, needle_size);\n            break;\n\n\t\tcase 21: case 22: case 23: case 24: case 25:\n\t\tcase 26: case 27: case 28: case 29: case 30:\n\t\tcase 31: case 32: case 33: case 34: case 35:\n\t\tcase 36: /* 21..36 */\n\t\t\tresult = sse4_strstr_unrolled_max36(s, n, needle, needle_size);\n            break;\n#endif // ENABLE_SSE4_LENGTH_SPECIALIZATIONS\n\t\tdefault:\n\t\t\tresult = sse4_strstr_unrolled_anysize(s, n, needle, needle_size);\n            break;\n    }\n\n\n    if (result <= n - needle_size) {\n        return result;\n    } else {\n        return std::string::npos;\n    }\n}\n\n// --------------------------------------------------\n\nsize_t sse4_strstr_unrolled(const std::string& s, const std::string& needle) {\n\n    return sse4_strstr_unrolled(s.data(), s.size(), needle.data(), needle.size());\n}\n\n\n"
  },
  {
    "path": "sse4-strstr.cpp",
    "content": "size_t sse4_strstr_anysize(const char* s, size_t n, const char* needle, size_t needle_size) {\n\n    assert(needle_size > 4);\n    assert(n > 0);\n\n    const __m128i prefix = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needle));\n    const __m128i zeros  = _mm_setzero_si128();\n\n    for (size_t i = 0; i < n; i += 8) {\n\n        const __m128i data   = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));\n        const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);\n\n        const __m128i cmp    = _mm_cmpeq_epi16(result, zeros);\n\n        unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;\n\n        while (mask != 0) {\n\n            const auto bitpos = bits::get_first_bit_set(mask)/2;\n\n            if (memcmp(s + i + bitpos + 4, needle + 4, needle_size - 4) == 0) {\n                return i + bitpos;\n            }\n\n            mask = bits::clear_leftmost_set(mask);\n        }\n    }\n\n    return std::string::npos;\n}\n\n// ------------------------------------------------------------------------\n\ntemplate <size_t k, typename MEMCMP>\nsize_t sse4_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) {\n\n    assert(k > 4);\n    assert(n > 0);\n\n    const __m128i prefix = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needle));\n    const __m128i zeros  = _mm_setzero_si128();\n\n    for (size_t i = 0; i < n; i += 8) {\n\n        const __m128i data   = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));\n        const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);\n\n        const __m128i cmp    = _mm_cmpeq_epi16(result, zeros);\n\n        unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;\n\n        while (mask != 0) {\n\n            const auto bitpos = bits::get_first_bit_set(mask)/2;\n\n            if (memcmp_fun(s + i + bitpos + 4, needle + 4)) {\n                return i + bitpos;\n            }\n\n            mask = bits::clear_leftmost_set(mask);\n        }\n    }\n\n    return 
std::string::npos;\n}\n\n// ------------------------------------------------------------------------\n\nsize_t sse4_strstr_max20(const char* s, size_t n, const char* needle, size_t needle_size) {\n\n    const __m128i zeros  = _mm_setzero_si128();\n    const __m128i prefix = sse::load(needle);\n    const __m128i suffix = sse::load(needle + 4);\n    const __m128i suff_mask = sse::mask_lower_bytes(needle_size - 4);\n\n    for (size_t i = 0; i < n; i += 8) {\n\n        const __m128i data   = sse::load(s + i);\n        const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);\n\n        const __m128i cmp    = _mm_cmpeq_epi16(result, zeros);\n\n        unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;\n\n        while (mask != 0) {\n\n            const auto bitpos = bits::get_first_bit_set(mask)/2;\n\n            const __m128i str = sse::load(s + i + bitpos + 4);\n            const __m128i cmp = _mm_cmpeq_epi8(str, suffix);\n\n            if (_mm_testc_si128(cmp, suff_mask)) {\n\n                return i + bitpos;\n            }\n\n            mask = bits::clear_leftmost_set(mask);\n        }\n    }\n\n    return std::string::npos;\n}\n\n// ------------------------------------------------------------------------\n\nsize_t sse4_strstr_max36(const char* s, size_t n, const char* needle, size_t needle_size) {\n\n    const __m128i zeros     = _mm_setzero_si128();\n    const __m128i prefix    = sse::load(needle);\n    const __m128i suffix1   = sse::load(needle + 4);\n    const __m128i suffix2   = sse::load(needle + 16 + 4);\n    const __m128i suff_mask = sse::mask_higher_bytes(needle_size - (16 + 4));\n\n    for (size_t i = 0; i < n; i += 8) {\n\n        const __m128i data   = sse::load(s + i);\n        const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);\n\n        const __m128i cmp    = _mm_cmpeq_epi16(result, zeros);\n\n        unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;\n\n        while (mask != 0) {\n\n            const auto bitpos = 
bits::get_first_bit_set(mask)/2;\n\n            const __m128i c1 = _mm_cmpeq_epi8(sse::load(s + i + bitpos + 4), suffix1);\n            const __m128i c2 = _mm_cmpeq_epi8(sse::load(s + i + bitpos + 16 + 4), suffix2);\n\n            const __m128i c3 = _mm_or_si128(c2, suff_mask);\n            const __m128i tmp = _mm_and_si128(c1, c3);\n\n            if (_mm_movemask_epi8(tmp) == 0xffff) {\n\n                return i + bitpos;\n            }\n\n            mask = bits::clear_leftmost_set(mask);\n        }\n    }\n\n    return std::string::npos;\n}\n\n// ------------------------------------------------------------------------\n\nsize_t sse4_strstr_len3(const char* s, size_t n, const char* needle) {\n\n    const __m128i prefix = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needle));\n    const __m128i zeros  = _mm_setzero_si128();\n\n    for (size_t i = 0; i < n; i += 8) {\n\n        const __m128i data     = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));\n        const __m128i lastbyte = _mm_cvtepu8_epi16(_mm_srli_si128(data, 3));\n        const __m128i result   = _mm_mpsadbw_epu8(data, prefix, 0);\n\n        const __m128i cmp    = _mm_cmpeq_epi16(_mm_sub_epi16(result, lastbyte), zeros);\n\n        unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;\n\n        if (mask != 0) {\n\n            return i + bits::get_first_bit_set(mask)/2;\n        }\n    }\n\n    return std::string::npos;\n}\n\n// ------------------------------------------------------------------------\n\nsize_t sse4_strstr_len4(const char* s, size_t n, const char* needle) {\n\n    const __m128i prefix = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needle));\n    const __m128i zeros  = _mm_setzero_si128();\n\n    for (size_t i = 0; i < n; i += 8) {\n\n        const __m128i data   = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i));\n        const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);\n\n        const __m128i cmp    = _mm_cmpeq_epi16(result, zeros);\n\n        unsigned 
mask = _mm_movemask_epi8(cmp);\n\n        if (mask != 0) {\n\n            return i + bits::get_first_bit_set(mask)/2;\n        }\n    }\n\n    return std::string::npos;\n}\n\n// ------------------------------------------------------------------------\n\nsize_t sse4_strstr(const char* s, size_t n, const char* needle, size_t needle_size) {\n\n    size_t result = std::string::npos;\n\n    if (n < needle_size) {\n        return result;\n    }\n\n\tswitch (needle_size) {\n\t\tcase 0:\n\t\t\treturn 0;\n\n\t\tcase 1: {\n            const char* res = reinterpret_cast<const char*>(strchr(s, needle[0]));\n\n\t\t\treturn (res != nullptr) ? res - s : std::string::npos;\n            }\n\t\tcase 2: {\n\t\t\tconst char* res = reinterpret_cast<const char*>(strstr(s, needle));\n\n\t\t\treturn (res != nullptr) ? res - s : std::string::npos;\n            }\n\t\tcase 3:\n\n\t\t\tresult = sse4_strstr_len3(s, n, needle);\n            break;\n\n\t\tcase 4:\n\t\t\tresult = sse4_strstr_len4(s, n, needle);\n            break;\n\n#if 1\n\t\tcase 5:\n            result = sse4_strstr_memcmp<5>(s, n, needle, memcmp1);\n            break;\n\n\t\tcase 6:\n            result = sse4_strstr_memcmp<6>(s, n, needle, memcmp2);\n            break;\n\n\t\tcase 7:\n            result = sse4_strstr_memcmp<7>(s, n, needle, memcmp3);\n            break;\n\n\t\tcase 8:\n            result = sse4_strstr_memcmp<8>(s, n, needle, memcmp4);\n            break;\n\n\t\tcase 9:\n            result = sse4_strstr_memcmp<9>(s, n, needle, memcmp5);\n            break;\n\n\t\tcase 10:\n            result = sse4_strstr_memcmp<10>(s, n, needle, memcmp6);\n            break;\n\n\t\tcase 11:\n            result = sse4_strstr_memcmp<11>(s, n, needle, memcmp7);\n            break;\n\n\t\tcase 12:\n            result = sse4_strstr_memcmp<12>(s, n, needle, memcmp8);\n            break;\n\n\t\tcase 13:\n            result = sse4_strstr_memcmp<13>(s, n, needle, memcmp9);\n            break;\n\n\t\tcase 14:\n            result = 
sse4_strstr_memcmp<14>(s, n, needle, memcmp10);\n            break;\n#else\n        case 5: case 6: case 7: case 8:\n        case 9: case 10: case 11: case 12:\n        case 13: case 14: /* 5 .. 14 */\n#endif\n\t\tcase 15: case 16: case 17: case 18: case 19:\n\t\tcase 20: /* 15..20 */\n\t\t    result = sse4_strstr_max20(s, n, needle, needle_size);\n            break;\n\n\t\tcase 21: case 22: case 23: case 24: case 25:\n\t\tcase 26: case 27: case 28: case 29: case 30:\n\t\tcase 31: case 32: case 33: case 34: case 35:\n\t\tcase 36: /* 21..36 */\n\t\t\tresult = sse4_strstr_max36(s, n, needle, needle_size);\n            break;\n\n\t\tdefault:\n\t\t\tresult = sse4_strstr_anysize(s, n, needle, needle_size);\n            break;\n    }\n\n\n    if (result <= n - needle_size) {\n        return result;\n    } else {\n        return std::string::npos;\n    }\n}\n\n// --------------------------------------------------\n\nsize_t sse4_strstr(const std::string& s, const std::string& needle) {\n\n    return sse4_strstr(s.data(), s.size(), needle.data(), needle.size());\n}\n\n"
  },
  {
    "path": "sse4.2-strstr.cpp",
    "content": "/* Usage of PCMPESTRM instruction from SSE 4.2 */\n\nsize_t FORCE_INLINE sse42_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) {\n\n    assert(k > 0);\n    assert(n > 0);\n\n    const __m128i N = _mm_loadu_si128((__m128i*)needle);\n\n    for (size_t i = 0; i < n; i += 16) {\n    \n        const int mode = _SIDD_UBYTE_OPS \n                       | _SIDD_CMP_EQUAL_ORDERED\n                       | _SIDD_BIT_MASK;\n\n        const __m128i D   = _mm_loadu_si128((__m128i*)(s + i));\n        const __m128i res = _mm_cmpestrm(N, k, D, n - i, mode);\n        uint64_t mask = _mm_cvtsi128_si64(res);\n\n        while (mask != 0) {\n\n            const auto bitpos = bits::get_first_bit_set(mask);\n\n            // we know that at least the first character of needle matches\n            if (memcmp(s + i + bitpos + 1, needle + 1, k - 1) == 0) {\n                return i + bitpos;\n            }\n\n            mask = bits::clear_leftmost_set(mask);\n        }\n    }\n\n    return std::string::npos;\n}\n\n\ntemplate <size_t k, typename MEMCMP>\nsize_t FORCE_INLINE sse42_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) {\n\n    assert(k > 0);\n    assert(n > 0);\n\n    const __m128i N = _mm_loadu_si128((__m128i*)needle);\n\n    for (size_t i = 0; i < n; i += 16) {\n    \n        const int mode = _SIDD_UBYTE_OPS \n                       | _SIDD_CMP_EQUAL_ORDERED\n                       | _SIDD_BIT_MASK;\n\n        const __m128i D   = _mm_loadu_si128((__m128i*)(s + i));\n        const __m128i res = _mm_cmpestrm(N, k, D, n - i, mode);\n        uint64_t mask = _mm_cvtsi128_si64(res);\n\n        while (mask != 0) {\n\n            const auto bitpos = bits::get_first_bit_set(mask);\n\n            if (memcmp_fun(s + i + bitpos + 1, needle + 1)) {\n                return i + bitpos;\n            }\n\n            mask = bits::clear_leftmost_set(mask);\n        }\n    }\n\n    return std::string::npos;\n}\n\n// 
------------------------------------------------------------------------\n\nsize_t sse42_strstr(const char* s, size_t n, const char* needle, size_t k) {\n\n    size_t result = std::string::npos;\n\n    if (n < k) {\n        return result;\n    }\n\n\tswitch (k) {\n\t\tcase 0:\n\t\t\treturn 0;\n\n\t\tcase 1: {\n            const char* res = reinterpret_cast<const char*>(strchr(s, needle[0]));\n\n\t\t\treturn (res != nullptr) ? res - s : std::string::npos;\n            }\n\n        case 2:\n            result = sse42_strstr_memcmp<2>(s, n, needle, memcmp1);\n            break;\n\n        case 3:\n            result = sse42_strstr_memcmp<3>(s, n, needle, memcmp2);\n            break;\n\n        case 4:\n            result = sse42_strstr_memcmp<4>(s, n, needle, memcmp3);\n            break;\n\n        case 5:\n            result = sse42_strstr_memcmp<5>(s, n, needle, memcmp4);\n            break;\n\n        case 6:\n            result = sse42_strstr_memcmp<6>(s, n, needle, memcmp5);\n            break;\n\n        case 7:\n            result = sse42_strstr_memcmp<7>(s, n, needle, memcmp6);\n            break;\n\n        case 8:\n            result = sse42_strstr_memcmp<8>(s, n, needle, memcmp7);\n            break;\n\n        case 9:\n            result = sse42_strstr_memcmp<9>(s, n, needle, memcmp8);\n            break;\n\n        case 10:\n            result = sse42_strstr_memcmp<10>(s, n, needle, memcmp9);\n            break;\n\n        case 11:\n            result = sse42_strstr_memcmp<11>(s, n, needle, memcmp10);\n            break;\n\n        case 12:\n            result = sse42_strstr_memcmp<12>(s, n, needle, memcmp11);\n            break;\n\n\t\tdefault:\n\t\t\tresult = sse42_strstr_anysize(s, n, needle, k);\n            break;\n    }\n\n    if (result <= n - k) {\n        return result;\n    } else {\n        return std::string::npos;\n    }\n}\n\n// ------------------------------------------------------------------------\n\nsize_t sse42_strstr(const 
std::string& s, const std::string& needle) {\n\n    return sse42_strstr(s.data(), s.size(), needle.data(), needle.size());\n}\n\n\n\n"
  },
  {
    "path": "swar32-strstr-v2.cpp",
    "content": "size_t FORCE_INLINE swar32_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) {\n\n    assert(k > 0);\n    assert(n > 0);\n\n    const uint32_t first = 0x01010101u * static_cast<uint8_t>(needle[0]);\n    const uint32_t last  = 0x01010101u * static_cast<uint8_t>(needle[k - 1]);\n\n    uint32_t* block_first = reinterpret_cast<uint32_t*>(const_cast<char*>(s));\n    uint32_t* block_last  = reinterpret_cast<uint32_t*>(const_cast<char*>(s + k - 1));\n\n    // 2. sequence scan\n    for (auto i=0u; i < n; i+=4, block_first++, block_last++) {\n        // 0 bytes in eq indicate matching chars\n        const uint32_t eq = (*block_first ^ first) | (*block_last ^ last);\n\n        // 7th bit set if lower 7 bits are zero\n        const uint32_t t0 = (~eq & 0x7f7f7f7fu) + 0x01010101u;\n        // 7th bit set if 7th bit is zero\n        const uint32_t t1 = (~eq & 0x80808080u);\n        uint32_t zeros = t0 & t1;\n        size_t j = 0;\n\n        while (zeros) {\n            if (zeros & 0x80) {\n                const char* substr = reinterpret_cast<char*>(block_first) + j + 1;\n                if (memcmp(substr, needle + 1, k - 2) == 0) {\n                    return i + j;\n                }\n            }\n\n            zeros >>= 8;\n            j += 1;\n        }\n    }\n\n    return std::string::npos;\n}\n\n\ntemplate <size_t k, typename MEMCMP>\nsize_t FORCE_INLINE swar32_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) {\n\n    assert(n > 0);\n\n    const uint32_t first = 0x01010101u * static_cast<uint8_t>(needle[0]);\n    const uint32_t last  = 0x01010101u * static_cast<uint8_t>(needle[k - 1]);\n\n    uint32_t* block_first = reinterpret_cast<uint32_t*>(const_cast<char*>(s));\n    uint32_t* block_last  = reinterpret_cast<uint32_t*>(const_cast<char*>(s + k - 1));\n\n    // 2. 
sequence scan\n    for (auto i=0u; i < n; i+=4, block_first++, block_last++) {\n        const uint32_t eq = (*block_first ^ first) | (*block_last ^ last);\n        const uint32_t t0 = (~eq & 0x7f7f7f7fu) + 0x01010101u;\n        const uint32_t t1 = (~eq & 0x80808080u);\n        uint32_t zeros = t0 & t1;\n        size_t j = 0;\n    \n        while (zeros) {\n            if (zeros & 0x80) {\n                const char* substr = reinterpret_cast<char*>(block_first) + j + 1;\n                if (memcmp_fun(substr, needle + 1)) {\n                    return i + j;\n                }\n            }\n\n            zeros >>= 8;\n            j += 1;\n        }\n    }\n\n    return std::string::npos;\n}\n\n// ------------------------------------------------------------------------\n\nsize_t swar32_strstr_v2(const char* s, size_t n, const char* needle, size_t k) {\n\n    size_t result = std::string::npos;\n\n    if (n < k) {\n        return result;\n    }\n\n\tswitch (k) {\n\t\tcase 0:\n\t\t\treturn 0;\n\n\t\tcase 1: {\n            const char* res = reinterpret_cast<const char*>(strchr(s, needle[0]));\n\n\t\t\treturn (res != nullptr) ? 
res - s : std::string::npos;\n            }\n\n        case 2:\n            result = swar32_strstr_memcmp<2>(s, n, needle, always_true);\n            break;\n\n        case 3:\n            result = swar32_strstr_memcmp<3>(s, n, needle, memcmp1);\n            break;\n\n        case 4:\n            result = swar32_strstr_memcmp<4>(s, n, needle, memcmp2);\n            break;\n\n        case 5:\n            // Note: use memcmp4 rather than memcmp3, as the last character\n            //       of needle is already proven to be equal\n            result = swar32_strstr_memcmp<5>(s, n, needle, memcmp4);\n            break;\n\n        case 6:\n            result = swar32_strstr_memcmp<6>(s, n, needle, memcmp4);\n            break;\n\n        case 7:\n            result = swar32_strstr_memcmp<7>(s, n, needle, memcmp5);\n            break;\n\n        case 8:\n            result = swar32_strstr_memcmp<8>(s, n, needle, memcmp6);\n            break;\n\n        case 9:\n            // Note: use memcmp8 rather than memcmp7 for the same reason as above.\n            result = swar32_strstr_memcmp<9>(s, n, needle, memcmp8);\n            break;\n\n        case 10:\n            result = swar32_strstr_memcmp<10>(s, n, needle, memcmp8);\n            break;\n\n        case 11:\n            result = swar32_strstr_memcmp<11>(s, n, needle, memcmp9);\n            break;\n\n        case 12:\n            result = swar32_strstr_memcmp<12>(s, n, needle, memcmp10);\n            break;\n\n\t\tdefault:\n\t\t\tresult = swar32_strstr_anysize(s, n, needle, k);\n            break;\n    }\n\n    if (result <= n - k) {\n        return result;\n    } else {\n        return std::string::npos;\n    }\n}\n\n\nsize_t swar32_strstr_v2(const std::string& s, const std::string& needle) {\n\n    return swar32_strstr_v2(s.data(), s.size(), needle.data(), needle.size());\n}\n"
  },
  {
    "path": "swar64-strstr-v2.cpp",
    "content": "size_t FORCE_INLINE swar64_strstr_anysize(const char* s, size_t n, const char* needle, size_t k) {\n\n    assert(k > 0);\n    assert(n > 0);\n\n    const uint64_t first = 0x0101010101010101llu * static_cast<uint8_t>(needle[0]);\n    const uint64_t last  = 0x0101010101010101llu * static_cast<uint8_t>(needle[k - 1]);\n\n    uint64_t* block_first = reinterpret_cast<uint64_t*>(const_cast<char*>(s));\n    uint64_t* block_last  = reinterpret_cast<uint64_t*>(const_cast<char*>(s + k - 1));\n\n    // 2. sequence scan\n    for (auto i=0u; i < n; i+=8, block_first++, block_last++) {\n        // 0 bytes in eq indicate matching chars\n        const uint64_t eq = (*block_first ^ first) | (*block_last ^ last);\n\n        // 7th bit set if lower 7 bits are zero\n        const uint64_t t0 = (~eq & 0x7f7f7f7f7f7f7f7fllu) + 0x0101010101010101llu;\n        // 7th bit set if 7th bit is zero\n        const uint64_t t1 = (~eq & 0x8080808080808080llu);\n        uint64_t zeros = t0 & t1;\n        size_t j = 0;\n\n        while (zeros) {\n            if (zeros & 0x80) {\n                const char* substr = reinterpret_cast<char*>(block_first) + j + 1;\n                if (memcmp(substr, needle + 1, k - 2) == 0) {\n                    return i + j;\n                }\n            }\n\n            zeros >>= 8;\n            j += 1;\n        }\n    }\n\n    return std::string::npos;\n}\n\n\ntemplate <size_t k, typename MEMCMP>\nsize_t FORCE_INLINE swar64_strstr_memcmp(const char* s, size_t n, const char* needle, MEMCMP memcmp_fun) {\n\n    assert(n > 0);\n\n    const uint64_t first = 0x0101010101010101llu * static_cast<uint8_t>(needle[0]);\n    const uint64_t last  = 0x0101010101010101llu * static_cast<uint8_t>(needle[k - 1]);\n\n    uint64_t* block_first = reinterpret_cast<uint64_t*>(const_cast<char*>(s));\n    uint64_t* block_last  = reinterpret_cast<uint64_t*>(const_cast<char*>(s + k - 1));\n\n    // 2. 
sequence scan\n    for (auto i=0u; i < n; i+=8, block_first++, block_last++) {\n        const uint64_t eq = (*block_first ^ first) | (*block_last ^ last);\n        const uint64_t t0 = (~eq & 0x7f7f7f7f7f7f7f7fllu) + 0x0101010101010101llu;\n        const uint64_t t1 = (~eq & 0x8080808080808080llu);\n        uint64_t zeros = t0 & t1;\n        size_t j = 0;\n    \n        while (zeros) {\n            if (zeros & 0x80) {\n                const char* substr = reinterpret_cast<char*>(block_first) + j + 1;\n                if (memcmp_fun(substr, needle + 1)) {\n                    return i + j;\n                }\n            }\n\n            zeros >>= 8;\n            j += 1;\n        }\n    }\n\n    return std::string::npos;\n}\n\n// ------------------------------------------------------------------------\n\nsize_t swar64_strstr_v2(const char* s, size_t n, const char* needle, size_t k) {\n\n    size_t result = std::string::npos;\n\n    if (n < k) {\n        return result;\n    }\n\n\tswitch (k) {\n\t\tcase 0:\n\t\t\treturn 0;\n\n\t\tcase 1: {\n            const char* res = reinterpret_cast<const char*>(strchr(s, needle[0]));\n\n\t\t\treturn (res != nullptr) ? 
res - s : std::string::npos;\n            }\n\n        case 2:\n            result = swar64_strstr_memcmp<2>(s, n, needle, always_true);\n            break;\n\n        case 3:\n            result = swar64_strstr_memcmp<3>(s, n, needle, memcmp1);\n            break;\n\n        case 4:\n            result = swar64_strstr_memcmp<4>(s, n, needle, memcmp2);\n            break;\n\n        case 5:\n            // Note: use memcmp4 rather than memcmp3, as the last character\n            //       of needle is already proven to be equal\n            result = swar64_strstr_memcmp<5>(s, n, needle, memcmp4);\n            break;\n\n        case 6:\n            result = swar64_strstr_memcmp<6>(s, n, needle, memcmp4);\n            break;\n\n        case 7:\n            result = swar64_strstr_memcmp<7>(s, n, needle, memcmp5);\n            break;\n\n        case 8:\n            result = swar64_strstr_memcmp<8>(s, n, needle, memcmp6);\n            break;\n\n        case 9:\n            // Note: use memcmp8 rather than memcmp7 for the same reason as above.\n            result = swar64_strstr_memcmp<9>(s, n, needle, memcmp8);\n            break;\n\n        case 10:\n            result = swar64_strstr_memcmp<10>(s, n, needle, memcmp8);\n            break;\n\n        case 11:\n            result = swar64_strstr_memcmp<11>(s, n, needle, memcmp9);\n            break;\n\n        case 12:\n            result = swar64_strstr_memcmp<12>(s, n, needle, memcmp10);\n            break;\n\n\t\tdefault:\n\t\t\tresult = swar64_strstr_anysize(s, n, needle, k);\n            break;\n    }\n\n    if (result <= n - k) {\n        return result;\n    } else {\n        return std::string::npos;\n    }\n}\n\n\nsize_t swar64_strstr_v2(const std::string& s, const std::string& needle) {\n\n    return swar64_strstr_v2(s.data(), s.size(), needle.data(), needle.size());\n}\n"
  },
  {
    "path": "utils/ansi.cpp",
    "content": "namespace ansi {\n    \n    const int RED   = 31;\n    const int GREEN = 32;\n    const int WHITE = 37;\n\n    std::string seq(const std::string& str, int color) {\n\n        return \"\\033[\" + std::to_string(color) + \"m\" + str + \"\\033[0m\";\n    }\n\n} // namespace ansi\n\n"
  },
  {
    "path": "utils/avx2.cpp",
    "content": "namespace avx2 {\n\n    union proxy {\n        __m256i  vec;\n        uint8_t  u8[32];\n        uint16_t u16[16];\n    };\n\n\n    namespace dump {\n\n        void epu16(const __m256i vec) {\n            \n            proxy p;\n            p.vec = vec;\n\n            for (int i=0; i < 16; i++) {\n                printf(\"%04x \", p.u16[i]);\n            }\n\n            putchar('\\n');\n        }\n\n        void epu8(const __m256i vec) {\n            \n            proxy p;\n            p.vec = vec;\n\n            putchar('\\'');\n            for (int i=0; i < 32; i++) {\n                printf(\"%02x \", p.u8[i]);\n            }\n\n            putchar('\\'');\n            putchar('\\n');\n        }\n\n    } // namespace dump\n\n} // namespace sse\n"
  },
  {
    "path": "utils/avx512.cpp",
    "content": "namespace avx512 {\n\n    union proxy {\n        __m512i  vec;\n        uint8_t  u8[64];\n        uint16_t u16[32];\n    };\n\n\n    namespace dump {\n\n        void epu16(const __m512i vec) {\n            \n            proxy p;\n            p.vec = vec;\n\n            for (int i=0; i < 32; i++) {\n                printf(\"%04x \", p.u16[i]);\n            }\n\n            putchar('\\n');\n        }\n\n        void epu8(const __m512i vec) {\n            \n            proxy p;\n            p.vec = vec;\n\n            putchar('\\'');\n            for (int i=0; i < 64; i++) {\n                printf(\"%02x \", p.u8[i]);\n            }\n\n            putchar('\\'');\n            putchar('\\n');\n        }\n\n    } // namespace dump\n\n} // namespace sse\n"
  },
  {
    "path": "utils/bits.cpp",
    "content": "\nnamespace bits {\n\n    template <typename T>\n    T clear_leftmost_set(const T value) {\n\n        assert(value != 0);\n\n        return value & (value - 1);\n    }\n\n\n    template <typename T>\n    unsigned get_first_bit_set(const T value) {\n\n        assert(value != 0);\n\n        return __builtin_ctz(value);\n    }\n\n\n    template <>\n    unsigned get_first_bit_set<uint64_t>(const uint64_t value) {\n\n        assert(value != 0);\n\n        return __builtin_ctzl(value);\n    }\n\n} // namespace bits\n"
  },
  {
    "path": "utils/neon.cpp",
    "content": "namespace neon {\n\n    namespace dump {\n\n        void epu8(const uint8x16_t vec) {\n            \n            uint8_t p[16];\n            vst1q_u8(p, vec);\n\n            putchar('\\'');\n            for (int i=0; i < 16; i++) {\n                printf(\"%02x \", p[i]);\n            }\n\n            putchar('\\'');\n            putchar('\\n');\n        }\n\n        void epu8(const uint8x8_t vec) {\n            \n            uint8_t p[8];\n            vst1_u8(p, vec);\n\n            putchar('\\'');\n            for (int i=0; i < 8; i++) {\n                printf(\"%02x \", p[i]);\n            }\n\n            putchar('\\'');\n            putchar('\\n');\n        }\n\n    } // namespace dump\n\n} // namespace sse\n"
  },
  {
    "path": "utils/sse.cpp",
    "content": "namespace sse {\n\n    template <typename T>\n    __m128i load(T ptr) {\n        \n        return _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr));\n    }\n\n    __m128i mask_lower_bytes(size_t n) {\n\n        // assert(n < 16)\n\n        static const uint8_t mask[32] = {\n            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, \n            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, \n            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \n            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \n        };\n\n        return load(mask + 16 - n);\n    }\n\n    __m128i mask_higher_bytes(size_t n) {\n\n        // assert(n < 16)\n\n        static const uint8_t mask[32] = {\n            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \n            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, \n            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, \n            0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, \n        };\n\n        return load(mask + 16 - n);\n    }\n\n\n    union proxy {\n        __m128i  vec;\n        uint8_t  u8[16];\n        uint16_t u16[8];\n    };\n\n\n    namespace dump {\n\n        void epu16(const __m128i vec) {\n            \n            proxy p;\n            p.vec = vec;\n\n            for (int i=0; i < 8; i++) {\n                printf(\"%04x \", p.u16[i]);\n            }\n\n            putchar('\\n');\n        }\n\n        void epu8(const __m128i vec) {\n            \n            proxy p;\n            p.vec = vec;\n\n            putchar('\\'');\n            for (int i=0; i < 16; i++) {\n                printf(\"%02x \", p.u8[i]);\n            }\n\n            putchar('\\'');\n            putchar('\\n');\n        }\n\n    } // namespace dump\n\n} // namespace sse\n"
  }
]