Repository: IlyaGrebnov/libsais Branch: master Commit: b6e52ef33fe1 Files: 14 Total size: 1.6 MB Directory structure: gitextract__tus48nc/ ├── Benchmarks.md ├── CHANGES ├── CMakeLists.txt ├── LICENSE ├── README.md ├── VERSION ├── include/ │ ├── libsais.h │ ├── libsais16.h │ ├── libsais16x64.h │ └── libsais64.h └── src/ ├── libsais.c ├── libsais16.c ├── libsais16x64.c └── libsais64.c ================================================ FILE CONTENTS ================================================ ================================================ FILE: Benchmarks.md ================================================ # Specifications * OS: Windows 11 Pro (64-bit) * CPU: AMD Ryzen 9 9950X3D (16C / 32T, 128MB L3 cache, PBO +200Mhz, CO -10 all-core) * RAM: 2 x 48 GB DDR5-6000 (28-36-36-60) * Compiler: clang 19.1.5 '-O3 -march=znver5 -fopenmp -DLIBSAIS_OPENMP -DNDEBUG' The times reflect the median of 15 runs for suffix array construction, reported in both **single-threaded (ST)** and **multi-threaded (MT)** modes. For optimal performance, libsais was limited to 12 threads, while divsufsort was limited to 16 threads. 
### [Silesia Corpus](https://www.data-compression.info/Corpora/SilesiaCorpus/index.html) ### | file | size | libsais 2.10.4 (ST) | divsufsort 2.0.2 (ST) |speedup (ST)| libsais 2.10.4 (MT) | divsufsort 2.0.2 (MT) |speedup (MT)| |:---------------:|:-----------:|:--------------------------:|:--------------------------:|:----------:|:--------------------------:|:--------------------------:|:----------:| | dickens | 10192446 | 0.108 sec ( 94.52 MB/s) | 0.297 sec ( 34.33 MB/s) |**+175.35%**| 0.059 sec ( 173.57 MB/s) | 0.195 sec ( 52.21 MB/s) |**+232.45%**| | mozilla | 51220480 | 0.542 sec ( 94.55 MB/s) | 1.163 sec ( 44.04 MB/s) |**+114.70%**| 0.318 sec ( 161.13 MB/s) | 0.725 sec ( 70.69 MB/s) |**+127.94%**| | mr | 9970564 | 0.095 sec ( 104.55 MB/s) | 0.239 sec ( 41.66 MB/s) |**+150.96%**| 0.069 sec ( 145.10 MB/s) | 0.163 sec ( 61.28 MB/s) |**+136.81%**| | nci | 33553445 | 0.274 sec ( 122.48 MB/s) | 0.682 sec ( 49.20 MB/s) |**+148.97%**| 0.142 sec ( 235.87 MB/s) | 0.605 sec ( 55.43 MB/s) |**+325.56%**| | ooffice | 6152192 | 0.072 sec ( 85.36 MB/s) | 0.130 sec ( 47.18 MB/s) | **+80.94%**| 0.052 sec ( 119.34 MB/s) | 0.081 sec ( 76.34 MB/s) | **+56.32%**| | osdb | 10085684 | 0.110 sec ( 91.44 MB/s) | 0.214 sec ( 47.04 MB/s) | **+94.40%**| 0.073 sec ( 138.57 MB/s) | 0.168 sec ( 60.01 MB/s) |**+130.91%**| | reymont | 6627202 | 0.069 sec ( 95.98 MB/s) | 0.173 sec ( 38.36 MB/s) |**+150.23%**| 0.042 sec ( 157.05 MB/s) | 0.129 sec ( 51.38 MB/s) |**+205.66%**| | samba | 21606400 | 0.208 sec ( 103.82 MB/s) | 0.440 sec ( 49.15 MB/s) |**+111.24%**| 0.133 sec ( 163.02 MB/s) | 0.304 sec ( 71.01 MB/s) |**+129.56%**| | sao | 7251944 | 0.107 sec ( 67.75 MB/s) | 0.154 sec ( 47.08 MB/s) | **+43.89%**| 0.079 sec ( 91.84 MB/s) | 0.095 sec ( 75.99 MB/s) | **+20.85%**| | webster | 41458703 | 0.445 sec ( 93.24 MB/s) | 1.335 sec ( 31.04 MB/s) |**+200.33%**| 0.243 sec ( 170.50 MB/s) | 0.944 sec ( 43.93 MB/s) |**+288.10%**| | x-ray | 8474240 | 0.139 sec ( 60.85 MB/s) | 0.219 sec ( 38.61 MB/s) | 
**+57.62%**| 0.093 sec ( 91.56 MB/s) | 0.106 sec ( 80.02 MB/s) | **+14.43%**| | xml | 5345280 | 0.044 sec ( 122.71 MB/s) | 0.097 sec ( 55.15 MB/s) |**+122.52%**| 0.029 sec ( 182.47 MB/s) | 0.077 sec ( 69.57 MB/s) |**+162.29%**| ### [Large Canterbury Corpus](https://www.data-compression.info/Corpora/CanterburyCorpus/) ### | file | size | libsais 2.10.4 (ST) | divsufsort 2.0.2 (ST) |speedup (ST)| libsais 2.10.4 (MT) | divsufsort 2.0.2 (MT) |speedup (MT)| |:---------------:|:-----------:|:--------------------------:|:--------------------------:|:----------:|:--------------------------:|:--------------------------:|:----------:| | bible.txt | 4047392 | 0.040 sec ( 100.99 MB/s) | 0.106 sec ( 38.15 MB/s) |**+164.69%**| 0.026 sec ( 156.84 MB/s) | 0.075 sec ( 53.85 MB/s) |**+191.28%**| | E.coli | 4638690 | 0.052 sec ( 89.05 MB/s) | 0.156 sec ( 29.70 MB/s) |**+199.79%**| 0.024 sec ( 192.42 MB/s) | 0.123 sec ( 37.71 MB/s) |**+410.27%**| | world192.txt | 2473400 | 0.023 sec ( 108.73 MB/s) | 0.056 sec ( 43.92 MB/s) |**+147.54%**| 0.017 sec ( 149.17 MB/s) | 0.037 sec ( 66.84 MB/s) |**+123.17%**| ### [Manzini Corpus](https://people.unipmn.it/manzini/lightweight/corpus/) ### | file | size | libsais 2.10.4 (ST) | divsufsort 2.0.2 (ST) |speedup (ST)| libsais 2.10.4 (MT) | divsufsort 2.0.2 (MT) |speedup (MT)| |:---------------:|:-----------:|:--------------------------:|:--------------------------:|:----------:|:--------------------------:|:--------------------------:|:----------:| | chr22.dna | 34553758 | 0.400 sec ( 86.39 MB/s) | 1.252 sec ( 27.61 MB/s) |**+212.96%**| 0.177 sec ( 195.37 MB/s) | 1.024 sec ( 33.73 MB/s) |**+479.24%**| | etext99 | 105277340 | 1.341 sec ( 78.51 MB/s) | 4.171 sec ( 25.24 MB/s) |**+211.02%**| 0.776 sec ( 135.69 MB/s) | 2.855 sec ( 36.87 MB/s) |**+268.00%**| | gcc-3.0.tar | 86630400 | 0.909 sec ( 95.30 MB/s) | 2.332 sec ( 37.15 MB/s) |**+156.53%**| 0.529 sec ( 163.65 MB/s) | 1.614 sec ( 53.67 MB/s) |**+204.93%**| | howto | 39422105 | 0.435 sec ( 90.63 
MB/s) | 1.235 sec ( 31.91 MB/s) |**+183.99%**| 0.237 sec ( 166.57 MB/s) | 0.794 sec ( 49.66 MB/s) |**+235.45%**| | jdk13c | 69728899 | 0.692 sec ( 100.74 MB/s) | 1.864 sec ( 37.40 MB/s) |**+169.37%**| 0.412 sec ( 169.13 MB/s) | 1.495 sec ( 46.63 MB/s) |**+262.72%**| | linux-2.4.5.tar | 116254720 | 1.281 sec ( 90.74 MB/s) | 3.380 sec ( 34.39 MB/s) |**+163.81%**| 0.898 sec ( 129.48 MB/s) | 2.409 sec ( 48.26 MB/s) |**+168.31%**| | rctail96 | 114711151 | 1.314 sec ( 87.30 MB/s) | 3.790 sec ( 30.27 MB/s) |**+188.41%**| 0.935 sec ( 122.65 MB/s) | 2.945 sec ( 38.95 MB/s) |**+214.89%**| | rfc | 116421901 | 1.269 sec ( 91.75 MB/s) | 3.601 sec ( 32.33 MB/s) |**+183.84%**| 0.887 sec ( 131.32 MB/s) | 2.626 sec ( 44.33 MB/s) |**+196.21%**| | sprot34.dat | 109617186 | 1.242 sec ( 88.26 MB/s) | 3.739 sec ( 29.32 MB/s) |**+201.01%**| 0.848 sec ( 129.27 MB/s) | 2.532 sec ( 43.30 MB/s) |**+198.56%**| | w3c2 | 104201579 | 1.083 sec ( 96.21 MB/s) | 2.848 sec ( 36.59 MB/s) |**+162.95%**| 0.809 sec ( 128.80 MB/s) | 2.326 sec ( 44.80 MB/s) |**+187.50%**| ### [Large Text Compression Benchmark Corpus](https://www.mattmahoney.net/dc/textdata.html) ### | file | size | libsais 2.10.4 (ST) | divsufsort 2.0.2 (ST) |speedup (ST)| libsais 2.10.4 (MT) | divsufsort 2.0.2 (MT) |speedup (MT)| |:---------------:|:-----------:|:--------------------------:|:--------------------------:|:----------:|:--------------------------:|:--------------------------:|:----------:| | enwik8 | 100000000 | 1.224 sec ( 81.69 MB/s) | 3.785 sec ( 26.42 MB/s) |**+209.19%**| 0.827 sec ( 120.98 MB/s) | 2.568 sec ( 38.94 MB/s) |**+210.67%**| | enwik9 | 1000000000 | 13.633 sec ( 73.35 MB/s) | 42.792 sec ( 23.37 MB/s) |**+213.88%**| 9.410 sec ( 106.27 MB/s) | 29.223 sec ( 34.22 MB/s) |**+210.56%**| ### [The Gauntlet Corpus](https://github.com/michaelmaniscalco/gauntlet_corpus) ### | file | size | libsais 2.10.4 (ST) | divsufsort 2.0.2 (ST) |speedup (ST)| libsais 2.10.4 (MT) | divsufsort 2.0.2 (MT) |speedup (MT)| 
|:---------------:|:-----------:|:--------------------------:|:--------------------------:|:----------:|:--------------------------:|:--------------------------:|:----------:| | abac | 200000 | 0.002 sec ( 120.88 MB/s) | 0.001 sec ( 156.91 MB/s) | -22.97% | 0.001 sec ( 193.69 MB/s) | 0.001 sec ( 151.70 MB/s) | **+27.68%**| | abba | 10500596 | 0.070 sec ( 150.70 MB/s) | 0.267 sec ( 39.27 MB/s) |**+283.78%**| 0.046 sec ( 228.50 MB/s) | 0.297 sec ( 35.37 MB/s) |**+546.11%**| | book1x20 | 15375420 | 0.138 sec ( 111.81 MB/s) | 0.477 sec ( 32.21 MB/s) |**+247.16%**| 0.086 sec ( 178.20 MB/s) | 0.330 sec ( 46.62 MB/s) |**+282.23%**| | fib_s14930352 | 14930352 | 0.122 sec ( 122.45 MB/s) | 0.537 sec ( 27.79 MB/s) |**+340.60%**| 0.082 sec ( 183.15 MB/s) | 0.594 sec ( 25.14 MB/s) |**+628.64%**| | fss10 | 12078908 | 0.091 sec ( 133.24 MB/s) | 0.406 sec ( 29.73 MB/s) |**+348.11%**| 0.063 sec ( 192.55 MB/s) | 0.463 sec ( 26.10 MB/s) |**+637.75%**| | fss9 | 2851443 | 0.026 sec ( 109.23 MB/s) | 0.098 sec ( 29.10 MB/s) |**+275.34%**| 0.015 sec ( 192.40 MB/s) | 0.104 sec ( 27.47 MB/s) |**+600.49%**| | houston | 3839141 | 0.023 sec ( 165.51 MB/s) | 0.018 sec ( 212.46 MB/s) | -22.10% | 0.012 sec ( 333.62 MB/s) | 0.018 sec ( 219.29 MB/s) | **+52.14%**| | paper5x80 | 956322 | 0.009 sec ( 105.78 MB/s) | 0.018 sec ( 53.26 MB/s) | **+98.60%**| 0.006 sec ( 173.22 MB/s) | 0.014 sec ( 67.59 MB/s) |**+156.27%**| | test1 | 2097152 | 0.026 sec ( 81.67 MB/s) | 0.042 sec ( 50.29 MB/s) | **+62.39%**| 0.014 sec ( 147.02 MB/s) | 0.041 sec ( 51.48 MB/s) |**+185.60%**| | test2 | 2097152 | 0.026 sec ( 81.80 MB/s) | 0.032 sec ( 66.13 MB/s) | **+23.70%**| 0.014 sec ( 149.18 MB/s) | 0.032 sec ( 66.40 MB/s) |**+124.69%**| | test3 | 2097088 | 0.023 sec ( 91.81 MB/s) | 0.034 sec ( 62.20 MB/s) | **+47.61%**| 0.024 sec ( 87.59 MB/s) | 0.038 sec ( 55.21 MB/s) | **+58.65%**| ### [Pizza & Chilli Corpus](https://pizzachili.dcc.uchile.cl/texts.html) ### | file | size | libsais 2.10.4 (ST) | divsufsort 2.0.2 (ST) 
|speedup (ST)| libsais 2.10.4 (MT) | divsufsort 2.0.2 (MT) |speedup (MT)| |:---------------:|:-----------:|:--------------------------:|:--------------------------:|:----------:|:--------------------------:|:--------------------------:|:----------:| | dblp.xml | 296135874 | 3.422 sec ( 86.54 MB/s) | 9.243 sec ( 32.04 MB/s) |**+170.10%**| 2.568 sec ( 115.33 MB/s) | 7.178 sec ( 41.26 MB/s) |**+179.53%**| | dna | 403927746 | 5.691 sec ( 70.98 MB/s) | 19.028 sec ( 21.23 MB/s) |**+234.35%**| 3.476 sec ( 116.21 MB/s) | 16.365 sec ( 24.68 MB/s) |**+370.82%**| | english.1024MB | 1073741824 | 16.110 sec ( 66.65 MB/s) | 52.004 sec ( 20.65 MB/s) |**+222.81%**| 11.219 sec ( 95.71 MB/s) | 37.177 sec ( 28.88 MB/s) |**+231.37%**| | pitches | 55832855 | 0.690 sec ( 80.96 MB/s) | 1.656 sec ( 33.71 MB/s) |**+140.12%**| 0.462 sec ( 120.98 MB/s) | 1.024 sec ( 54.51 MB/s) |**+121.96%**| | proteins | 1184051855 | 18.368 sec ( 64.46 MB/s) | 66.699 sec ( 17.75 MB/s) |**+263.13%**| 12.717 sec ( 93.10 MB/s) | 35.746 sec ( 33.12 MB/s) |**+181.08%**| | sources | 210866607 | 2.469 sec ( 85.40 MB/s) | 6.702 sec ( 31.46 MB/s) |**+171.45%**| 1.780 sec ( 118.43 MB/s) | 4.741 sec ( 44.48 MB/s) |**+166.27%**| ### [Pizza & Chilli Repetitive Corpus](https://pizzachili.dcc.uchile.cl/repcorpus.html) ### | file | size | libsais 2.10.4 (ST) | divsufsort 2.0.2 (ST) |speedup (ST)| libsais 2.10.4 (MT) | divsufsort 2.0.2 (MT) |speedup (MT)| |:---------------:|:-----------:|:--------------------------:|:--------------------------:|:----------:|:--------------------------:|:--------------------------:|:----------:| | cere | 461286644 | 5.356 sec ( 86.12 MB/s) | 19.154 sec ( 24.08 MB/s) |**+257.61%**| 3.597 sec ( 128.23 MB/s) | 16.417 sec ( 28.10 MB/s) |**+356.39%**| | coreutils | 205281778 | 2.181 sec ( 94.11 MB/s) | 6.814 sec ( 30.13 MB/s) |**+212.39%**| 1.626 sec ( 126.26 MB/s) | 5.216 sec ( 39.36 MB/s) |**+220.81%**| | einstein.de.txt | 92758441 | 0.926 sec ( 100.22 MB/s) | 2.676 sec ( 34.66 MB/s) 
|**+189.17%**| 0.635 sec ( 146.08 MB/s) | 2.167 sec ( 42.81 MB/s) |**+241.27%**| | einstein.en.txt | 467626544 | 5.480 sec ( 85.33 MB/s) | 15.822 sec ( 29.56 MB/s) |**+188.72%**| 3.885 sec ( 120.36 MB/s) | 12.593 sec ( 37.13 MB/s) |**+224.13%**| |Escherichia_Coli | 112689515 | 1.272 sec ( 88.61 MB/s) | 4.562 sec ( 24.70 MB/s) |**+258.72%**| 0.864 sec ( 130.37 MB/s) | 3.827 sec ( 29.44 MB/s) |**+342.80%**| | influenza | 154808555 | 1.588 sec ( 97.51 MB/s) | 5.325 sec ( 29.07 MB/s) |**+235.42%**| 1.159 sec ( 133.60 MB/s) | 4.619 sec ( 33.51 MB/s) |**+298.67%**| | kernel | 257961616 | 2.747 sec ( 93.90 MB/s) | 8.993 sec ( 28.69 MB/s) |**+227.33%**| 2.030 sec ( 127.09 MB/s) | 6.614 sec ( 39.00 MB/s) |**+225.88%**| | para | 429265758 | 5.129 sec ( 83.69 MB/s) | 18.302 sec ( 23.45 MB/s) |**+256.81%**| 3.444 sec ( 124.63 MB/s) | 15.618 sec ( 27.49 MB/s) |**+353.45%**| | world_leaders | 46968181 | 0.338 sec ( 138.85 MB/s) | 0.758 sec ( 62.00 MB/s) |**+123.96%**| 0.202 sec ( 232.62 MB/s) | 0.637 sec ( 73.76 MB/s) |**+215.37%**| |dblp.xml.00001.1 | 104857600 | 1.944 sec ( 53.94 MB/s) | 3.809 sec ( 27.53 MB/s) | **+95.95%**| 0.848 sec ( 123.66 MB/s) | 3.339 sec ( 31.40 MB/s) |**+293.78%**| |dblp.xml.00001.2 | 104857600 | 1.922 sec ( 54.57 MB/s) | 3.825 sec ( 27.42 MB/s) | **+99.02%**| 0.845 sec ( 124.11 MB/s) | 3.358 sec ( 31.22 MB/s) |**+297.49%**| | dblp.xml.0001.1 | 104857600 | 1.876 sec ( 55.89 MB/s) | 3.753 sec ( 27.94 MB/s) |**+100.03%**| 0.845 sec ( 124.08 MB/s) | 3.272 sec ( 32.05 MB/s) |**+287.18%**| | dblp.xml.0001.2 | 104857600 | 1.840 sec ( 56.98 MB/s) | 3.743 sec ( 28.01 MB/s) |**+103.42%**| 0.844 sec ( 124.26 MB/s) | 3.242 sec ( 32.35 MB/s) |**+284.16%**| | dna.001.1 | 104857600 | 1.957 sec ( 53.59 MB/s) | 4.475 sec ( 23.43 MB/s) |**+128.73%**| 0.785 sec ( 133.57 MB/s) | 3.835 sec ( 27.34 MB/s) |**+388.49%**| | english.001.2 | 104857600 | 2.035 sec ( 51.53 MB/s) | 4.360 sec ( 24.05 MB/s) |**+114.26%**| 0.914 sec ( 114.76 MB/s) | 3.242 sec ( 32.35 MB/s) 
|**+254.80%**| | proteins.001.1 | 104857600 | 2.001 sec ( 52.41 MB/s) | 4.486 sec ( 23.37 MB/s) |**+124.24%**| 0.871 sec ( 120.43 MB/s) | 2.880 sec ( 36.40 MB/s) |**+230.82%**| | sources.001.2 | 104857600 | 1.777 sec ( 59.02 MB/s) | 3.649 sec ( 28.74 MB/s) |**+105.40%**| 0.847 sec ( 123.79 MB/s) | 2.940 sec ( 35.67 MB/s) |**+247.04%**| | fib41 | 267914296 | 3.362 sec ( 79.68 MB/s) | 16.837 sec ( 15.91 MB/s) |**+400.76%**| 2.082 sec ( 128.66 MB/s) | 17.004 sec ( 15.76 MB/s) |**+716.58%**| | rs.13 | 216747218 | 2.292 sec ( 94.57 MB/s) | 13.105 sec ( 16.54 MB/s) |**+471.82%**| 1.671 sec ( 129.71 MB/s) | 13.311 sec ( 16.28 MB/s) |**+696.59%**| | tm29 | 268435456 | 8.414 sec ( 31.90 MB/s) | 19.460 sec ( 13.79 MB/s) |**+131.29%**| 2.407 sec ( 111.51 MB/s) | 20.004 sec ( 13.42 MB/s) |**+730.96%**| ### [Skyline Corpus](http://panthema.net/2012/1119-eSAIS-Inducing-Suffix-and-LCP-Arrays-in-External-Memory/eSAIS-DC3-LCP-0.5.0/src/input/skyline.h.html) ### | file | size | libsais 2.10.4 (ST) | divsufsort 2.0.2 (ST) |speedup (ST)| libsais 2.10.4 (MT) | divsufsort 2.0.2 (MT) |speedup (MT)| |:---------------:|:-----------:|:--------------------------:|:--------------------------:|:----------:|:--------------------------:|:--------------------------:|:----------:| | skyline20.txt | 1048575 | 0.016 sec ( 64.17 MB/s) | 0.055 sec ( 18.92 MB/s) |**+239.18%**| 0.011 sec ( 97.18 MB/s) | 0.055 sec ( 19.15 MB/s) |**+407.38%**| | skyline22.txt | 4194303 | 0.068 sec ( 61.81 MB/s) | 0.260 sec ( 16.16 MB/s) |**+282.46%**| 0.033 sec ( 127.33 MB/s) | 0.276 sec ( 15.20 MB/s) |**+737.65%**| | skyline24.txt | 16777215 | 0.375 sec ( 44.76 MB/s) | 1.355 sec ( 12.38 MB/s) |**+261.57%**| 0.140 sec ( 119.44 MB/s) | 1.376 sec ( 12.19 MB/s) |**+879.45%**| | skyline26.txt | 67108863 | 2.862 sec ( 23.45 MB/s) | 7.519 sec ( 8.93 MB/s) |**+162.71%**| 0.772 sec ( 86.97 MB/s) | 7.566 sec ( 8.87 MB/s) |**+880.50%**| | skyline28.txt | 268435455 | 12.159 sec ( 22.08 MB/s) | 36.970 sec ( 7.26 MB/s) |**+204.06%**| 
3.827 sec ( 70.15 MB/s) | 36.876 sec ( 7.28 MB/s) |**+863.64%**| ## Additional memory The libsais reuses the space allocated for the suffix array during construction. In rare cases, this space is not large enough for the fastest algorithm and libsais will need to fall back to a less efficient one (libsais has 4 algorithms at different break-points: 6k, 4k, 2k and 1k; where k is the alphabet size). To improve performance for those cases you can allocate additional space at the end of the suffix array. | file | size | libsais + O(n) (ST) | libsais + O(1) (ST) |speedup (ST)| libsais + O(n) (MT) | libsais + O(1) (MT) |speedup (MT)| |:---------------:|:-----------:|:--------------------------:|:--------------------------:|:----------:|:--------------------------:|:--------------------------:|:----------:| | osdb | 10085684 | 0.100 sec ( 100.68 MB/s) | 0.107 sec ( 94.43 MB/s) | **+6.61%**| 0.071 sec ( 142.67 MB/s) | 0.078 sec ( 128.57 MB/s) | **+10.96%**| | x-ray | 8474240 | 0.112 sec ( 75.75 MB/s) | 0.138 sec ( 61.20 MB/s) | **+23.77%**| 0.070 sec ( 121.65 MB/s) | 0.093 sec ( 91.46 MB/s) | **+33.01%**| | sao | 7251944 | 0.091 sec ( 80.02 MB/s) | 0.106 sec ( 68.19 MB/s) | **+17.35%**| 0.068 sec ( 107.05 MB/s) | 0.080 sec ( 90.48 MB/s) | **+18.32%**| | ooffice | 6152192 | 0.065 sec ( 94.39 MB/s) | 0.071 sec ( 86.20 MB/s) | **+9.49%**| 0.048 sec ( 127.93 MB/s) | 0.055 sec ( 111.39 MB/s) | **+14.85%**| | test3 | 2097088 | 0.017 sec ( 125.08 MB/s) | 0.019 sec ( 109.88 MB/s) | **+13.83%**| 0.015 sec ( 135.61 MB/s) | 0.018 sec ( 114.18 MB/s) | **+18.76%**| > * All other files from [Benchmarks](#benchmarks) above do not suffer from these fallbacks. ================================================ FILE: CHANGES ================================================ Changes in 2.10.4 (September 1, 2025) - Tuned prefetch distance for improved throughput. Changes in 2.10.3 (August 12, 2025) - No functional changes, added CMake install and export package rules.
Changes in 2.10.2 (June 10, 2025) - Improved performance of suffix array and burrows wheeler transform construction on degenerate inputs. Changes in 2.10.1 (May 11, 2025) - No functional changes, slightly improved performance. Changes in 2.10.0 (April 12, 2025) - Improved performance, with noticeable gains on ARM architecture. - Fixed compiler warnings and addressed undefined behavior. Changes in 2.9.1 (March 19, 2025) - No functional changes, resolved compiler warnings & undefined behavior. Changes in 2.9.0 (March 16, 2025) - Support for generalized suffix array (GSA) construction. - Support for longest common prefix array (LCP) construction for generalized suffix array (GSA). Changes in 2.8.7 (January 16, 2025) - Restore the input array after suffix array construction (libsais64 & libsais16x64). Changes in 2.8.6 (November 18, 2024) - Fixed out-of-bound memory access issue for large inputs. Changes in 2.8.5 (July 31, 2024) - Miscellaneous changes to reduce compiler warnings about implicit functions. Changes in 2.8.4 (June 13, 2024) - Additional OpenMP acceleration (libsais16 & libsais16x64). Changes in 2.8.3 (June 11, 2024) - Implemented suffix array construction of a long 16-bit array (libsais16x64). Changes in 2.8.2 (May 27, 2024) - Implemented suffix array construction of a long 64-bit array (libsais64). Changes in 2.8.1 (April 5, 2024) - Fixed out-of-bound memory access issue for large inputs (libsais64). Changes in 2.8.0 (March 3, 2024) - Implemented permuted longest common prefix array (PLCP) construction of an integer array. - Fixed compilation error when compiling the library with OpenMP enabled. Changes in 2.7.5 (February 26, 2024) - Improved performance of suffix array and burrows wheeler transform construction on degenerate inputs. Changes in 2.7.4 (February 23, 2024) - Resolved strict aliasing violation resulted in invalid code generation by Intel compiler. 
Changes in 2.7.3 (April 21, 2023) - CMake script for library build and integration with other projects. Changes in 2.7.2 (April 18, 2023) - Fixed out-of-bound memory access issue for large inputs (libsais64). Changes in 2.7.1 (June 19, 2022) - Improved cache coherence for ARMv8 architecture. Changes in 2.7.0 (April 12, 2022) - Support for longest common prefix array (LCP) construction. Changes in 2.6.5 (January 1, 2022) - Exposed functions to construct suffix array of a given integer array. - Improved detection of various compiler intrinsics. - Capped free space parameter to avoid crashing due to 32-bit integer overflow. Changes in 2.6.0 (October 21, 2021) - libsais16 for 16-bit inputs. Changes in 2.5.0 (October 15, 2021) - Support for optional symbol frequency tables. Changes in 2.4.0 (July 14, 2021) - Reverse Burrows-Wheeler transform. Changes in 2.3.0 (June 23, 2021) - Burrows-Wheeler transform with auxiliary indexes. Changes in 2.2.0 (April 27, 2021) - libsais64 for inputs larger than 2GB. Changes in 2.1.0 (April 19, 2021) - Additional OpenMP acceleration. Changes in 2.0.0 (April 4, 2021) - OpenMP acceleration. Changes in 1.0.0 (February 23, 2021) - Initial Release.
================================================
FILE: CMakeLists.txt
================================================
# Build script for libsais: one library target (static by default, optionally
# shared) built from the four libsais translation units, with optional OpenMP
# support and install/export rules.
cmake_minimum_required(VERSION 3.10)

project(
    libsais
    VERSION 2.10.4
    LANGUAGES C
    DESCRIPTION "The libsais library provides fast linear-time construction of suffix array (SA), generalized suffix array (GSA), longest common prefix (LCP) array, permuted LCP (PLCP) array, Burrows-Wheeler transform (BWT) and inverse BWT based on the induced sorting algorithm with optional OpenMP support for multi-core parallel construction."
)

# GNUInstallDirs supplies the CMAKE_INSTALL_* destinations used below;
# CMakePackageConfigHelpers supplies write_basic_package_version_file().
include(GNUInstallDirs)
include(CMakePackageConfigHelpers)

# Require strict C99 without compiler-specific extensions.
set(CMAKE_C_STANDARD 99)
set(CMAKE_C_STANDARD_REQUIRED ON)
set(CMAKE_C_EXTENSIONS OFF)

option(LIBSAIS_USE_OPENMP "Use OpenMP for parallelization" OFF)
option(LIBSAIS_BUILD_SHARED_LIB "Build libsais as a shared library" OFF)

if(LIBSAIS_BUILD_SHARED_LIB)
    set(LIBSAIS_LIBRARY_TYPE SHARED)
else()
    set(LIBSAIS_LIBRARY_TYPE STATIC)
endif()

add_library(libsais ${LIBSAIS_LIBRARY_TYPE})

# The target is already named "libsais", so clear the prefixes CMake would
# otherwise prepend to the output file name.
set_target_properties(libsais PROPERTIES
    PREFIX ""
    IMPORT_PREFIX ""
    VERSION ${PROJECT_VERSION}
    SOVERSION ${PROJECT_VERSION_MAJOR}
    POSITION_INDEPENDENT_CODE ON
)

target_sources(libsais PRIVATE
    include/libsais.h
    include/libsais16.h
    include/libsais16x64.h
    include/libsais64.h
    src/libsais.c
    src/libsais16.c
    src/libsais16x64.c
    src/libsais64.c
)

# OpenMP is a hard requirement only when the user explicitly asks for it.
if(LIBSAIS_USE_OPENMP)
    find_package(OpenMP REQUIRED)
endif()

if(LIBSAIS_USE_OPENMP AND OpenMP_C_FOUND)
    target_compile_definitions(libsais PUBLIC LIBSAIS_OPENMP)
    target_link_libraries(libsais PUBLIC OpenMP::OpenMP_C)
endif()

# LIBSAIS_SHARED is PUBLIC so consumers see it too; LIBSAIS_EXPORTS is PRIVATE
# so it is defined only while compiling the library itself.
if(LIBSAIS_BUILD_SHARED_LIB)
    target_compile_definitions(libsais PUBLIC LIBSAIS_SHARED)
    target_compile_definitions(libsais PRIVATE LIBSAIS_EXPORTS)
endif()

# Use the in-tree headers when building from source and the installed headers
# when the exported target is consumed. The generator expressions keep the
# absolute source path out of the exported target.
target_include_directories(libsais PUBLIC
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>
)

install(
    TARGETS libsais
    EXPORT libsaisTargets
    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
    INCLUDES DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
)

install(
    DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/
    DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
)

# The export set is installed directly as libsaisConfig.cmake, alongside the
# version file generated below.
install(
    EXPORT libsaisTargets
    FILE libsaisConfig.cmake
    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/libsais
)

write_basic_package_version_file(
    "${CMAKE_CURRENT_BINARY_DIR}/libsaisConfigVersion.cmake"
    VERSION ${PROJECT_VERSION}
    COMPATIBILITY SameMajorVersion
)

install(
    FILES "${CMAKE_CURRENT_BINARY_DIR}/libsaisConfigVersion.cmake"
    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/libsais
)

================================================
FILE: LICENSE
================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. 
For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================ # libsais The libsais library provides fast (see [Benchmarks](#benchmarks) below) linear-time construction of suffix array (SA), generalized suffix array (GSA), longest common prefix (LCP) array, permuted LCP (PLCP) array, Burrows-Wheeler transform (BWT) and inverse BWT, based on the induced sorting algorithm described in the following papers (with optional OpenMP support for multi-threaded parallel construction): * Ge Nong, Sen Zhang, Wai Hong Chan *Two Efficient Algorithms for Linear Suffix Array Construction*, 2009 * Juha Karkkainen, Giovanni Manzini, Simon J. 
Puglisi *Permuted Longest-Common-Prefix Array*, 2009 * Nataliya Timoshevskaya, Wu-chun Feng *SAIS-OPT: On the characterization and optimization of the SA-IS algorithm for suffix array construction*, 2014 * Jing Yi Xie, Ge Nong, Bin Lao, Wentao Xu *Scalable Suffix Sorting on a Multicore Machine*, 2020 Copyright (c) 2021-2025 Ilya Grebnov >The libsais is inspired by [libdivsufsort](https://github.com/y-256/libdivsufsort), [sais](https://sites.google.com/site/yuta256/sais) libraries by Yuta Mori and [msufsort](https://github.com/michaelmaniscalco/msufsort) by Michael Maniscalco. ## libcubwt If you are looking for even faster construction times, you can try [libcubwt](https://github.com/IlyaGrebnov/libcubwt), a library for GPU-based suffix array, inverse suffix array and Burrows-Wheeler transform construction. ## Introduction The libsais provides a simple C99 API to construct the suffix array and Burrows-Wheeler transformed string from a given string over a constant-size alphabet. The algorithm runs in linear time using typically only ~16KB of extra memory (with 2n bytes as absolute worst-case; where n is the length of the string). OpenMP acceleration uses 200KB of additional memory per thread. > * The libsais works with compilers from GNU, Microsoft and Intel, but I recommend Clang for best performance. > * The libsais is sensitive to fast memory and software prefetching and might not be suitable for some workloads. Please benchmark yourself. ## License The libsais is released under the [Apache License Version 2.0](LICENSE "Apache license") ## Multi-threading The libsais is memory-bound, so performance scales primarily with memory bandwidth and concurrency, not raw compute. 
The optimal number of threads for suffix array construction depends on CPU and memory architecture, as different systems (DDR4 vs DDR5, Intel vs AMD, single- vs multi-CCD) saturate memory at different points with maximum throughput generally following the number of memory channels rather than total core count. The x86-64 dual-channel systems typically saturate near 8 threads, but in practice may show good scaling at 6, 12 or even 16 threads. ## Changes _For the full changelog, see [CHANGES](CHANGES)._ * September 1, 2025 (2.10.4) * Tuned prefetch distance for improved throughput. * August 12, 2025 (2.10.3) * No functional changes, added CMake install and export package rules. * June 10, 2025 (2.10.2) * Improved performance of suffix array and burrows wheeler transform construction on degenerate inputs. * May 11, 2025 (2.10.1) * No functional changes, slightly improved performance. * April 12, 2025 (2.10.0) * Improved performance, with noticeable gains on ARM architecture. * Fixed compiler warnings and addressed undefined behavior. * March 19, 2025 (2.9.1) * No functional changes, resolved compiler warnings & undefined behavior. * March 16, 2025 (2.9.0) * Support for generalized suffix array (GSA) construction. * Support for longest common prefix array (LCP) construction for generalized suffix array (GSA). ## Versions of the libsais * [libsais.c](src/libsais.c) (and corresponding [libsais.h](include/libsais.h)) is for suffix array, GSA, PLCP, LCP, forward BWT and reverse BWT construction over 8-bit inputs smaller than 2GB (2147483648 bytes). * [libsais64.c](src/libsais64.c) (and corresponding [libsais64.h](include/libsais64.h)) is an optional extension of the library for inputs larger than or equal to 2GB (2147483648 bytes). * This version of the library could also be used to construct suffix array of an integer array (with a caveat that input array must be mutable). 
* [libsais16.c](src/libsais16.c) + [libsais16x64.c](src/libsais16x64.c) (and corresponding [libsais16.h](include/libsais16.h) + [libsais16x64.h](include/libsais16x64.h)) is independent version of the library for 16-bit inputs. * This version of the library could also be used to construct suffix array and BWT of a set of strings by adding a unique end-of-string symbol to each string and then computing the result for the concatenated string. ## Examples of APIs (see [libsais.h](include/libsais.h), [libsais16.h](include/libsais16.h), [libsais16x64.h](include/libsais16x64.h) and [libsais64.h](include/libsais64.h) for complete APIs list) ```c /** * Constructs the suffix array of a given string. * @param T [0..n-1] The input string. * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the given string. * @param fs The extra space available at the end of SA array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @return 0 if no error occurred, -1 or -2 otherwise. */ int32_t libsais(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq); /** * Constructs the suffix array of a given integer array. * Note, during construction input array will be modified, but restored at the end if no errors occurred. * @param T [0..n-1] The input integer array. * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the integer array. * @param k The alphabet size of the input integer array. * @param fs Extra space available at the end of SA array (can be 0, but 4k or better 6k is recommended for optimal performance). * @return 0 if no error occurred, -1 or -2 otherwise. */ int32_t libsais_int(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs); /** * Constructs the burrows-wheeler transformed string (BWT) of a given string. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n-1+fs] The temporary array. 
* @param n The length of the given string. * @param fs The extra space available at the end of A array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @return The primary index if no error occurred, -1 or -2 otherwise. */ int32_t libsais_bwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq); /** * Constructs the original string from a given burrows-wheeler transformed string (BWT) with primary index. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). * @param n The length of the given string. * @param freq [0..255] The input symbol frequency table (can be NULL). * @param i The primary index. * @return 0 if no error occurred, -1 or -2 otherwise. */ int32_t libsais_unbwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i); /** * Constructs the permuted longest common prefix array (PLCP) of a given string and a suffix array. * @param T [0..n-1] The input string. * @param SA [0..n-1] The input suffix array. * @param PLCP [0..n-1] The output permuted longest common prefix array. * @param n The length of the string and the suffix array. * @return 0 if no error occurred, -1 otherwise. 
*/ int32_t libsais_plcp(const uint8_t * T, const int32_t * SA, int32_t * PLCP, int32_t n); ``` ## Example installation using [CPM](https://github.com/cpm-cmake/CPM.cmake) ```cmake CPMAddPackage( NAME libsais GITHUB_REPOSITORY IlyaGrebnov/libsais GIT_TAG v2.10.4 OPTIONS "LIBSAIS_USE_OPENMP OFF" "LIBSAIS_BUILD_SHARED_LIB OFF" ) target_link_libraries( libsais) ``` # Algorithm description The libsais uses the SA-IS (Suffix Array Induced Sorting) algorithm to construct both the suffix array and the Burrows-Wheeler transform through recursive decomposition and induced sorting: * Initially, the algorithm classifies each position in a string as either an S-type or an L-type, based on whether the suffix starting at that position is lexicographically smaller or larger than the suffix at the adjacent right position. Positions identified as S-type, which have an adjacent left L-type position, are further categorized as LMS-type (Leftmost S-type) positions. Next, the algorithm splits the input string into LMS substrings, which start at an LMS-type position and extend up to the next adjacent LMS-type position. These LMS substrings are then lexicographically sorted through induced sorting and subsequently replaced in the input string with their corresponding sorted ranks, thus forming a new, compacted string. This compacted string reduces the problem size, enabling the algorithm to perform a recursive decomposition in which it is reapplied to construct the suffix array for the compacted string. And at the end of the recursive call, the suffix array for the input string is constructed from the suffix array of the compacted string using another round of induced sorting. * The induced sorting is a core mechanic of the SA-IS algorithm and is employed twice during each recursive call: initially before the recursive call to establish the order of LMS substrings, and subsequently after the recursive call to finalize the order of the suffixes of the string. 
This process involves two sequential scans: a left-to-right scan that determines the order of L-type positions based on the LMS-type positions, followed by a right-to-left scan that establishes the order of S-type positions based on L-type positions. These scans efficiently extend the ordering from LMS-type positions to all positions in the string. The SA-IS algorithm is quite elegant, yet implementing it efficiently presents multiple challenges. The primary challenge is that the SA-IS algorithm exhibits random memory access patterns, which can significantly decrease efficiency due to cache misses. Another significant challenge is that the SA-IS algorithm is not a lightweight construction algorithm; it requires additional memory to support positions classification, induced sorting, compacted string representations, and recursive decomposition. To circumvent this, the libsais implements careful optimizations that are worth highlighting: * The libsais is meticulously designed from the ground up to leverage the capabilities of modern microprocessors, aiming to minimize various stalls and enhance throughput through instruction-level parallelism. The library employs sophisticated techniques such as manual loop unrolling, software prefetching, and branch elimination to achieve this goal. Moreover, it strives to minimize the number of passes over the data by combining multiple operations into a single function. A prime example of these techniques could be observed in the initialization phase of the SA-IS algorithm. 
In this phase, the entire logic required to classify positions, count symbols into various buckets, and segment the string into LMS substrings is executed through a single, completely branch-less loop: ```c for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) { libsais_prefetchr(&T[i - 2 * prefetch_distance]); libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]); libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]); libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]); libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0); buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++; c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 0); m -= (f0 & ~f1); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++; c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i - 1); m -= (f1 & ~f0); buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++; c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 2); m -= (f0 & ~f1); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++; } ``` * To sort LMS substrings lexicographically and compute their ranks, the libsais algorithm begins by gathering LMS-type positions as they appear in the string, placing them at the end of the suffix array. The library then employs two passes of induced sorting, which concludes with these same LMS-type positions ordered lexicographically at the beginning of the suffix array. Once all LMS-type positions are sorted, the ranks of the LMS substrings are computed by inspecting each pair of adjacent positions to determine if the corresponding LMS substrings are identical. If they are the same, they receive the same rank; otherwise, the rank is incremented by one. 
* The first challenge of induced sorting is that, during passes over the suffix array, we need to examine each value to determine if it represents a valid position or an empty space, whether this position is not the beginning of the string (and thus could induce another position), and if the induced position is going to be of the necessary type (for example, during a left-to-right scan, we are only inducing L-type positions). This process can cause branch mispredictions and corresponding microprocessor stalls. To address this challenge, libsais employs following techniques. Firstly, the library uses two pointers per induction bucket, each pointing to different sections of the suffix array depending on the type of positions these positions will be inducing next. This approach allows for the separation of LS-type (meaning S-type, which induces L-type; this is the same as LMS-type) and LL-type positions needed for the left-to-right scan from SL-type and SS-type positions needed for the right-to-left scan. Secondly, by understanding the distribution of symbols based on their position types and the types they induce (i.e., SS, SL, LS, LL), we can pre-calculate pointers for each bucket, leaving no empty spaces. And thirdly, by removing the first LMS position and all positions left of it from the initial gathering and distribution, we eliminate the need to check whether a position is not the beginning of the string. These techniques not only result in a completely branch-less loop for each induction sorting pass but also eliminate redundant scanning and the final gathering of LMS-type positions at the beginning of the suffix array. * The second challenge arises after induced sorting when we need to compute the ranks of LMS substrings. To accomplish this, we must first calculate and store the lengths of LMS substrings and then inspect each pair of adjacent LMS-type positions to determine if the corresponding LMS substrings are identical. 
This comparison starts with their lengths, and if they are the same, proceeds to compare the substrings themselves. Such operations exhibit random memory access patterns, which can significantly decrease efficiency due to cache misses. However, libsais avoids this inefficient logic by incorporating the ranking of LMS substrings as part of the induced sorting process itself. The library achieves this by marking the most significant bit (MSB) of positions in the suffix array that start new ranking groups. Each time a position is processed during induced sorting, the library checks the MSB and increments the current rank if the beginning of a new ranking group is encountered. Additionally, for each pointer in an induction bucket, the rank of the previous induced position is maintained. Whenever another position is induced, this previous rank is used to determine whether to mark the newly induced position as the beginning of a new rank group. All the logic to update the ranks and mark the beginnings of new ranking groups is implemented using bit manipulation and is completely branch-less. 
```c for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) { libsais_prefetchr(&SA[i + 3 * prefetch_distance]); libsais_prefetchr(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 1); libsais_prefetchr(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 2); libsais_prefetchr(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 1); libsais_prefetchr(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 2); sa_sint_t p0 = SA[i + prefetch_distance + 0] & SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); libsais_prefetchw(&buckets[v0]); sa_sint_t p1 = SA[i + prefetch_distance + 1] & SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0); libsais_prefetchw(&buckets[v1]); sa_sint_t p2 = SA[i + 0]; d += (p2 < 0); p2 &= SAINT_MAX; sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] >= T[p2 - 1]); SA[buckets[v2]++] = (p2 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1)); buckets[2 + v2] = d; sa_sint_t p3 = SA[i + 1]; d += (p3 < 0); p3 &= SAINT_MAX; sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] >= T[p3 - 1]); SA[buckets[v3]++] = (p3 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1)); buckets[2 + v3] = d; } ``` * In the SA-IS algorithm, after induced sorting, the ranks of LMS substrings are computed in suffix order. These ranks then need to be scattered to reorder them in string order before being gathered again to form the compacted string for recursion. At this point, some LMS substrings may be unique, meaning they don't share their rank with any other LMS substring. Being unique, these substrings are essentially already sorted, and their position relative to other LMS substrings is already determined. However, these unique LMS substrings may still be necessary for sorting other, non-unique LMS substrings during recursion-unless a unique LMS substring is immediately followed by another unique LMS substring in the string. 
In such cases, the rank of any subsequent unique LMS substrings becomes redundant in the compacted string, as it will not be utilized. Leveraging this insight, libsais employs a strategy to further reduce the size of the compacted string by omitting such redundant LMS substring ranks. This process involves a few steps. First, unique LMS substrings are identified by looking ahead while scanning LMS-positions in the suffix array during the ranking and scattering phase. When scattering LMS substring ranks to form the compacted string, the most significant bit (MSB) of the rank is used to mark that this rank is unique. Next, as the library scans the ranks in string order and detects tandems of unique ranks using the MSB, it then recalculates the MSB for ranks which are redundant, thus marking them for removal from the compacted string. Subsequently, the libsais rescans the LMS-positions in suffix order to recompute the ranks, now focusing only on the ranks of the remaining LMS substrings. The library also uses MSB of first symbol of LMS substrings to mark that LMS substring is removed from the compacted string. Finally, the library builds the compacted string based on the newly recalculated ranks for the remaining LMS substrings, while also saving the final positions for the removed LMS substrings before proceeding with recursion. This reduction process not only further decreases the size of the compacted string but also reduces the alphabet size of the reduced string and creates additional free space in the suffix array, which can be utilized during recursion. * The SA-IS algorithm, while robust for suffix array construction, is not considered lightweight due to its need for additional memory for tasks such as position classification, induced sorting, the creation of compacted string representations, and recursive decomposition. 
To mitigate this, libsais optimizes memory usage by not storing position classifications and striving to reuse the memory space allocated for the suffix array for induced sorting, compacted string representations, and recursive decomposition processes. Since position classifications are not stored, the library recalculates them as needed, typically involving checks of adjacent symbols for a given position. Although this approach may seem straightforward, it introduces the challenge of random memory access. Nevertheless, libsais manages these accesses in a manner that either avoids unnecessary memory fetches or minimizes cache penalties. In situations where avoiding cache penalties is unfeasible, the library leverages the most significant bit (MSB) bits for computations, as branch mispredictions on modern microprocessors generally incur lower penalties than cache misses. Memory reuse for the suffix array, despite appearing straightforward, also presents hidden challenges related to implementation complexity. In certain cases, the available space in the suffix array may not suffice for the most optimal algorithm implementation mentioned above. Although such instances are rare, the library aims to deliver optimal performance without additional memory allocation by resorting to a less efficient variant of induced sorting. To accommodate various scenarios, libsais includes four distinct implementations tailored to different breakpoints based on alphabet size (denoted by 'k'): 6k, 4k, 2k, and 1k, with each implementation optimized to ensure performance efficiency. Extensive efforts have been dedicated to refining these implementations, including significant time invested in using various sanitizers to confirm the correctness of the algorithms. Ultimately, while there are specific inputs under which libsais might require additional memory-most of which tend to be synthetic tests designed specifically to challenge the SA-IS algorithm-such instances are relatively rare. 
In these exceptional cases, the library is designed to allocate only the minimum necessary amount of memory while still delivering the best possible performance. * The libsais library was initially developed for constructing suffix arrays, but has since broadened its scope to include the calculation of the longest common prefix (LCP) and both the forward and inverse Burrows-Wheeler Transform (BWT). Considerable effort has been dedicated to refining these algorithms to ensure they deliver maximum performance and maintain correctness. An illustrative example is the forward BWT, whose performance is nearly identical to that of suffix array construction; this is achieved by integrating a modified version of the induced sorting implementation within the final stage of the SA-IS algorithm. Rather than inducing suffix positions at this stage, the library induces the Burrows-Wheeler Transform directly. This approach also supports in-place transformation, maintaining a memory usage of 5n, making it suitable for data compression applications. Similarly, the inverse BWT is fine-tuned to operate in-place, adhering to the same memory efficiency of 5n with an additional optimization of a bi-gram LF-mapping technique, which allows for the decoding of two symbols simultaneously and effectively reduces the number of cache misses during the inversion of the Burrows-Wheeler Transform. # Benchmarks The full list of benchmarks has been moved to its own [Benchmarks.md](Benchmarks.md) file. ================================================ FILE: VERSION ================================================ 2.10.4 ================================================ FILE: include/libsais.h ================================================ /*-- This file is a part of libsais, a library for linear time suffix array, longest common prefix array and burrows wheeler transform construction. 
Copyright (c) 2021-2025 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. Please see the file LICENSE for full copyright information. --*/ #ifndef LIBSAIS_H #define LIBSAIS_H 1 #define LIBSAIS_VERSION_MAJOR 2 #define LIBSAIS_VERSION_MINOR 10 #define LIBSAIS_VERSION_PATCH 4 #define LIBSAIS_VERSION_STRING "2.10.4" #ifdef _WIN32 #ifdef LIBSAIS_SHARED #ifdef LIBSAIS_EXPORTS #define LIBSAIS_API __declspec(dllexport) #else #define LIBSAIS_API __declspec(dllimport) #endif #else #define LIBSAIS_API #endif #else #define LIBSAIS_API #endif #ifdef __cplusplus extern "C" { #endif #include /** * Creates the libsais context that allows reusing allocated memory with each libsais operation. * In multi-threaded environments, use one context per thread for parallel executions. * @return the libsais context, NULL otherwise. */ LIBSAIS_API void * libsais_create_ctx(void); #if defined(LIBSAIS_OPENMP) /** * Creates the libsais context that allows reusing allocated memory with each parallel libsais operation using OpenMP. * In multi-threaded environments, use one context per thread for parallel executions. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return the libsais context, NULL otherwise. */ LIBSAIS_API void * libsais_create_ctx_omp(int32_t threads); #endif /** * Destroys the libsais context and frees previously allocated memory. * @param ctx The libsais context (can be NULL). 
*/ LIBSAIS_API void libsais_free_ctx(void * ctx); /** * Constructs the suffix array of a given string. * @param T [0..n-1] The input string. * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the given string. * @param fs The extra space available at the end of SA array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS_API int32_t libsais(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq); /** * Constructs the generalized suffix array (GSA) of given string set. * @param T [0..n-1] The input string set using 0 as separators (T[n-1] must be 0). * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the given string set. * @param fs The extra space available at the end of SA array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS_API int32_t libsais_gsa(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq); /** * Constructs the suffix array of a given integer array. * Note, during construction input array will be modified, but restored at the end if no errors occurred. * @param T [0..n-1] The input integer array. * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the integer array. * @param k The alphabet size of the input integer array. * @param fs Extra space available at the end of SA array (can be 0, but 4k or better 6k is recommended for optimal performance). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS_API int32_t libsais_int(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs); /** * Constructs the suffix array of a given string using libsais context. * @param ctx The libsais context. * @param T [0..n-1] The input string. 
* @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the given string. * @param fs The extra space available at the end of SA array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS_API int32_t libsais_ctx(const void * ctx, const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq); /** * Constructs the generalized suffix array (GSA) of given string set using libsais context. * @param ctx The libsais context. * @param T [0..n-1] The input string set using 0 as separators (T[n-1] must be 0). * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the given string set. * @param fs The extra space available at the end of SA array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS_API int32_t libsais_gsa_ctx(const void * ctx, const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq); #if defined(LIBSAIS_OPENMP) /** * Constructs the suffix array of a given string in parallel using OpenMP. * @param T [0..n-1] The input string. * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the given string. * @param fs The extra space available at the end of SA array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS_API int32_t libsais_omp(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads); /** * Constructs the generalized suffix array (GSA) of given string set in parallel using OpenMP. * @param T [0..n-1] The input string set using 0 as separators (T[n-1] must be 0). 
* @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the given string set. * @param fs The extra space available at the end of SA array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS_API int32_t libsais_gsa_omp(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads); /** * Constructs the suffix array of a given integer array in parallel using OpenMP. * Note, during construction input array will be modified, but restored at the end if no errors occurred. * @param T [0..n-1] The input integer array. * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the integer array. * @param k The alphabet size of the input integer array. * @param fs Extra space available at the end of SA array (can be 0, but 4k or better 6k is recommended for optimal performance). * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS_API int32_t libsais_int_omp(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs, int32_t threads); #endif /** * Constructs the burrows-wheeler transformed string (BWT) of a given string. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n-1+fs] The temporary array. * @param n The length of the given string. * @param fs The extra space available at the end of A array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @return The primary index if no error occurred, -1 or -2 otherwise. 
*/ LIBSAIS_API int32_t libsais_bwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq); /** * Constructs the burrows-wheeler transformed string (BWT) of a given string with auxiliary indexes. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n-1+fs] The temporary array. * @param n The length of the given string. * @param fs The extra space available at the end of A array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @param r The sampling rate for auxiliary indexes (must be power of 2). * @param I [0..(n-1)/r] The output auxiliary indexes. * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS_API int32_t libsais_bwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I); /** * Constructs the burrows-wheeler transformed string (BWT) of a given string using libsais context. * @param ctx The libsais context. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n-1+fs] The temporary array. * @param n The length of the given string. * @param fs The extra space available at the end of A array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @return The primary index if no error occurred, -1 or -2 otherwise. */ LIBSAIS_API int32_t libsais_bwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq); /** * Constructs the burrows-wheeler transformed string (BWT) of a given string with auxiliary indexes using libsais context. * @param ctx The libsais context. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n-1+fs] The temporary array. * @param n The length of the given string. 
* @param fs The extra space available at the end of A array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @param r The sampling rate for auxiliary indexes (must be power of 2). * @param I [0..(n-1)/r] The output auxiliary indexes. * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS_API int32_t libsais_bwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I); #if defined(LIBSAIS_OPENMP) /** * Constructs the burrows-wheeler transformed string (BWT) of a given string in parallel using OpenMP. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n-1+fs] The temporary array. * @param n The length of the given string. * @param fs The extra space available at the end of A array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return The primary index if no error occurred, -1 or -2 otherwise. */ LIBSAIS_API int32_t libsais_bwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t threads); /** * Constructs the burrows-wheeler transformed string (BWT) of a given string with auxiliary indexes in parallel using OpenMP. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n-1+fs] The temporary array. * @param n The length of the given string. * @param fs The extra space available at the end of A array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @param r The sampling rate for auxiliary indexes (must be power of 2). * @param I [0..(n-1)/r] The output auxiliary indexes. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). 
* @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS_API int32_t libsais_bwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I, int32_t threads); #endif /** * Creates the libsais reverse BWT context that allows reusing allocated memory with each libsais_unbwt_* operation. * In multi-threaded environments, use one context per thread for parallel executions. * @return the libsais reverse BWT context, NULL otherwise. */ LIBSAIS_API void * libsais_unbwt_create_ctx(void); #if defined(LIBSAIS_OPENMP) /** * Creates the libsais reverse BWT context that allows reusing allocated memory with each parallel libsais_unbwt_* operation using OpenMP. * In multi-threaded environments, use one context per thread for parallel executions. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return the libsais reverse BWT context, NULL otherwise. */ LIBSAIS_API void * libsais_unbwt_create_ctx_omp(int32_t threads); #endif /** * Destroys the libsais reverse BWT context and frees previously allocated memory. * @param ctx The libsais reverse BWT context (can be NULL). */ LIBSAIS_API void libsais_unbwt_free_ctx(void * ctx); /** * Constructs the original string from a given burrows-wheeler transformed string (BWT) with primary index. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). * @param n The length of the given string. * @param freq [0..255] The input symbol frequency table (can be NULL). * @param i The primary index. * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS_API int32_t libsais_unbwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i); /** * Constructs the original string from a given burrows-wheeler transformed string (BWT) with primary index using libsais reverse BWT context. * @param ctx The libsais reverse BWT context.
* @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). * @param n The length of the given string. * @param freq [0..255] The input symbol frequency table (can be NULL). * @param i The primary index. * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS_API int32_t libsais_unbwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i); /** * Constructs the original string from a given burrows-wheeler transformed string (BWT) with auxiliary indexes. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). * @param n The length of the given string. * @param freq [0..255] The input symbol frequency table (can be NULL). * @param r The sampling rate for auxiliary indexes (must be power of 2). * @param I [0..(n-1)/r] The input auxiliary indexes. * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS_API int32_t libsais_unbwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I); /** * Constructs the original string from a given burrows-wheeler transformed string (BWT) with auxiliary indexes using libsais reverse BWT context. * @param ctx The libsais reverse BWT context. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). * @param n The length of the given string. * @param freq [0..255] The input symbol frequency table (can be NULL). * @param r The sampling rate for auxiliary indexes (must be power of 2). * @param I [0..(n-1)/r] The input auxiliary indexes. * @return 0 if no error occurred, -1 or -2 otherwise. 
*/ LIBSAIS_API int32_t libsais_unbwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I); #if defined(LIBSAIS_OPENMP) /** * Constructs the original string from a given burrows-wheeler transformed string (BWT) with primary index in parallel using OpenMP. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). * @param n The length of the given string. * @param freq [0..255] The input symbol frequency table (can be NULL). * @param i The primary index. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS_API int32_t libsais_unbwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i, int32_t threads); /** * Constructs the original string from a given burrows-wheeler transformed string (BWT) with auxiliary indexes in parallel using OpenMP. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). * @param n The length of the given string. * @param freq [0..255] The input symbol frequency table (can be NULL). * @param r The sampling rate for auxiliary indexes (must be power of 2). * @param I [0..(n-1)/r] The input auxiliary indexes. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS_API int32_t libsais_unbwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I, int32_t threads); #endif /** * Constructs the permuted longest common prefix array (PLCP) of a given string and a suffix array. * @param T [0..n-1] The input string. * @param SA [0..n-1] The input suffix array. 
* @param PLCP [0..n-1] The output permuted longest common prefix array. * @param n The length of the string and the suffix array. * @return 0 if no error occurred, -1 otherwise. */ LIBSAIS_API int32_t libsais_plcp(const uint8_t * T, const int32_t * SA, int32_t * PLCP, int32_t n); /** * Constructs the permuted longest common prefix array (PLCP) of a given string set and a generalized suffix array (GSA). * @param T [0..n-1] The input string set using 0 as separators (T[n-1] must be 0). * @param SA [0..n-1] The input generalized suffix array. * @param PLCP [0..n-1] The output permuted longest common prefix array. * @param n The length of the string set and the generalized suffix array. * @return 0 if no error occurred, -1 otherwise. */ LIBSAIS_API int32_t libsais_plcp_gsa(const uint8_t * T, const int32_t * SA, int32_t * PLCP, int32_t n); /** * Constructs the permuted longest common prefix array (PLCP) of a given integer array and a suffix array. * @param T [0..n-1] The input integer array. * @param SA [0..n-1] The input suffix array. * @param PLCP [0..n-1] The output permuted longest common prefix array. * @param n The length of the integer array and the suffix array. * @return 0 if no error occurred, -1 otherwise. */ LIBSAIS_API int32_t libsais_plcp_int(const int32_t * T, const int32_t * SA, int32_t * PLCP, int32_t n); /** * Constructs the longest common prefix array (LCP) of a given permuted longest common prefix array (PLCP) and a suffix array. * @param PLCP [0..n-1] The input permuted longest common prefix array. * @param SA [0..n-1] The input suffix array or generalized suffix array (GSA). * @param LCP [0..n-1] The output longest common prefix array (can be SA). * @param n The length of the permuted longest common prefix array and the suffix array. * @return 0 if no error occurred, -1 otherwise.
*/ LIBSAIS_API int32_t libsais_lcp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP, int32_t n); #if defined(LIBSAIS_OPENMP) /** * Constructs the permuted longest common prefix array (PLCP) of a given string and a suffix array in parallel using OpenMP. * @param T [0..n-1] The input string. * @param SA [0..n-1] The input suffix array. * @param PLCP [0..n-1] The output permuted longest common prefix array. * @param n The length of the string and the suffix array. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 otherwise. */ LIBSAIS_API int32_t libsais_plcp_omp(const uint8_t * T, const int32_t * SA, int32_t * PLCP, int32_t n, int32_t threads); /** * Constructs the permuted longest common prefix array (PLCP) of a given string set and a generalized suffix array (GSA) in parallel using OpenMP. * @param T [0..n-1] The input string set using 0 as separators (T[n-1] must be 0). * @param SA [0..n-1] The input generalized suffix array. * @param PLCP [0..n-1] The output permuted longest common prefix array. * @param n The length of the string set and the generalized suffix array. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 otherwise. */ LIBSAIS_API int32_t libsais_plcp_gsa_omp(const uint8_t * T, const int32_t * SA, int32_t * PLCP, int32_t n, int32_t threads); /** * Constructs the permuted longest common prefix array (PLCP) of a given integer array and a suffix array in parallel using OpenMP. * @param T [0..n-1] The input integer array. * @param SA [0..n-1] The input suffix array. * @param PLCP [0..n-1] The output permuted longest common prefix array. * @param n The length of the integer array and the suffix array. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 otherwise. 
*/ LIBSAIS_API int32_t libsais_plcp_int_omp(const int32_t * T, const int32_t * SA, int32_t * PLCP, int32_t n, int32_t threads); /** * Constructs the longest common prefix array (LCP) of a given permuted longest common prefix array (PLCP) and a suffix array in parallel using OpenMP. * @param PLCP [0..n-1] The input permuted longest common prefix array. * @param SA [0..n-1] The input suffix array or generalized suffix array (GSA). * @param LCP [0..n-1] The output longest common prefix array (can be SA). * @param n The length of the permuted longest common prefix array and the suffix array. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 otherwise. */ LIBSAIS_API int32_t libsais_lcp_omp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP, int32_t n, int32_t threads); #endif #ifdef __cplusplus } #endif #endif ================================================ FILE: include/libsais16.h ================================================ /*-- This file is a part of libsais, a library for linear time suffix array, longest common prefix array and burrows wheeler transform construction. Copyright (c) 2021-2025 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. Please see the file LICENSE for full copyright information. 
--*/ #ifndef LIBSAIS16_H #define LIBSAIS16_H 1 #define LIBSAIS16_VERSION_MAJOR 2 #define LIBSAIS16_VERSION_MINOR 10 #define LIBSAIS16_VERSION_PATCH 4 #define LIBSAIS16_VERSION_STRING "2.10.4" #ifdef _WIN32 #ifdef LIBSAIS_SHARED #ifdef LIBSAIS_EXPORTS #define LIBSAIS16_API __declspec(dllexport) #else #define LIBSAIS16_API __declspec(dllimport) #endif #else #define LIBSAIS16_API #endif #else #define LIBSAIS16_API #endif #ifdef __cplusplus extern "C" { #endif #include /** * Creates the libsais16 context that allows reusing allocated memory with each libsais16 operation. * In multi-threaded environments, use one context per thread for parallel executions. * @return the libsais16 context, NULL otherwise. */ LIBSAIS16_API void * libsais16_create_ctx(void); #if defined(LIBSAIS_OPENMP) /** * Creates the libsais16 context that allows reusing allocated memory with each parallel libsais16 operation using OpenMP. * In multi-threaded environments, use one context per thread for parallel executions. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return the libsais16 context, NULL otherwise. */ LIBSAIS16_API void * libsais16_create_ctx_omp(int32_t threads); #endif /** * Destroys the libsass context and free previusly allocated memory. * @param ctx The libsais16 context (can be NULL). */ LIBSAIS16_API void libsais16_free_ctx(void * ctx); /** * Constructs the suffix array of a given 16-bit string. * @param T [0..n-1] The input 16-bit string. * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the given 16-bit string. * @param fs The extra space available at the end of SA array (0 should be enough for most cases). * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). * @return 0 if no error occurred, -1 or -2 otherwise. 
*/ LIBSAIS16_API int32_t libsais16(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq); /** * Constructs the generalized suffix array (GSA) of given 16-bit string set. * @param T [0..n-1] The input 16-bit string set using 0 as separators (T[n-1] must be 0). * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the given 16-bit string set. * @param fs The extra space available at the end of SA array (0 should be enough for most cases). * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS16_API int32_t libsais16_gsa(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq); /** * Constructs the suffix array of a given integer array. * Note, during construction input array will be modified, but restored at the end if no errors occurred. * @param T [0..n-1] The input integer array. * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the integer array. * @param k The alphabet size of the input integer array. * @param fs Extra space available at the end of SA array (can be 0, but 4k or better 6k is recommended for optimal performance). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS16_API int32_t libsais16_int(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs); /** * Constructs the suffix array of a given 16-bit string using libsais16 context. * @param ctx The libsais16 context. * @param T [0..n-1] The input 16-bit string. * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the given 16-bit string. * @param fs The extra space available at the end of SA array (0 should be enough for most cases). * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). * @return 0 if no error occurred, -1 or -2 otherwise. 
*/ LIBSAIS16_API int32_t libsais16_ctx(const void * ctx, const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq); /** * Constructs the generalized suffix array (GSA) of given 16-bit string set using libsais16 context. * @param ctx The libsais16 context. * @param T [0..n-1] The input 16-bit string set using 0 as separators (T[n-1] must be 0). * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the given 16-bit string set. * @param fs The extra space available at the end of SA array (0 should be enough for most cases). * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS16_API int32_t libsais16_gsa_ctx(const void * ctx, const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq); #if defined(LIBSAIS_OPENMP) /** * Constructs the suffix array of a given 16-bit string in parallel using OpenMP. * @param T [0..n-1] The input 16-bit string. * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the given 16-bit string. * @param fs The extra space available at the end of SA array (0 should be enough for most cases). * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS16_API int32_t libsais16_omp(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads); /** * Constructs the generalized suffix array (GSA) of given 16-bit string set in parallel using OpenMP. * @param T [0..n-1] The input 16-bit string set using 0 as separators (T[n-1] must be 0). * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the given 16-bit string set. * @param fs The extra space available at the end of SA array (0 should be enough for most cases). 
* @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS16_API int32_t libsais16_gsa_omp(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads); /** * Constructs the suffix array of a given integer array in parallel using OpenMP. * Note, during construction input array will be modified, but restored at the end if no errors occurred. * @param T [0..n-1] The input integer array. * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the integer array. * @param k The alphabet size of the input integer array. * @param fs Extra space available at the end of SA array (can be 0, but 4k or better 6k is recommended for optimal performance). * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS16_API int32_t libsais16_int_omp(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs, int32_t threads); #endif /** * Constructs the burrows-wheeler transformed 16-bit string (BWT) of a given 16-bit string. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). * @param A [0..n-1+fs] The temporary array. * @param n The length of the given 16-bit string. * @param fs The extra space available at the end of A array (0 should be enough for most cases). * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). * @return The primary index if no error occurred, -1 or -2 otherwise. */ LIBSAIS16_API int32_t libsais16_bwt(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq); /** * Constructs the burrows-wheeler transformed 16-bit string (BWT) of a given 16-bit string with auxiliary indexes. * @param T [0..n-1] The input 16-bit string. 
* @param U [0..n-1] The output 16-bit string (can be T). * @param A [0..n-1+fs] The temporary array. * @param n The length of the given 16-bit string. * @param fs The extra space available at the end of A array (0 should be enough for most cases). * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). * @param r The sampling rate for auxiliary indexes (must be power of 2). * @param I [0..(n-1)/r] The output auxiliary indexes. * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS16_API int32_t libsais16_bwt_aux(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I); /** * Constructs the burrows-wheeler transformed 16-bit string (BWT) of a given 16-bit string using libsais16 context. * @param ctx The libsais16 context. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). * @param A [0..n-1+fs] The temporary array. * @param n The length of the given 16-bit string. * @param fs The extra space available at the end of A array (0 should be enough for most cases). * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). * @return The primary index if no error occurred, -1 or -2 otherwise. */ LIBSAIS16_API int32_t libsais16_bwt_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq); /** * Constructs the burrows-wheeler transformed 16-bit string (BWT) of a given 16-bit string with auxiliary indexes using libsais16 context. * @param ctx The libsais16 context. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). * @param A [0..n-1+fs] The temporary array. * @param n The length of the given 16-bit string. * @param fs The extra space available at the end of A array (0 should be enough for most cases). * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). 
* @param r The sampling rate for auxiliary indexes (must be power of 2). * @param I [0..(n-1)/r] The output auxiliary indexes. * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS16_API int32_t libsais16_bwt_aux_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I); #if defined(LIBSAIS_OPENMP) /** * Constructs the burrows-wheeler transformed 16-bit string (BWT) of a given 16-bit string in parallel using OpenMP. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). * @param A [0..n-1+fs] The temporary array. * @param n The length of the given 16-bit string. * @param fs The extra space available at the end of A array (0 should be enough for most cases). * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return The primary index if no error occurred, -1 or -2 otherwise. */ LIBSAIS16_API int32_t libsais16_bwt_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t threads); /** * Constructs the burrows-wheeler transformed 16-bit string (BWT) of a given 16-bit string with auxiliary indexes in parallel using OpenMP. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). * @param A [0..n-1+fs] The temporary array. * @param n The length of the given 16-bit string. * @param fs The extra space available at the end of A array (0 should be enough for most cases). * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). * @param r The sampling rate for auxiliary indexes (must be power of 2). * @param I [0..(n-1)/r] The output auxiliary indexes. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. 
*/ LIBSAIS16_API int32_t libsais16_bwt_aux_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I, int32_t threads); #endif /** * Creates the libsais16 reverse BWT context that allows reusing allocated memory with each libsais16_unbwt_* operation. * In multi-threaded environments, use one context per thread for parallel executions. * @return the libsais16 reverse BWT context, NULL otherwise. */ LIBSAIS16_API void * libsais16_unbwt_create_ctx(void); #if defined(LIBSAIS_OPENMP) /** * Creates the libsais16 reverse BWT context that allows reusing allocated memory with each parallel libsais16_unbwt_* operation using OpenMP. * In multi-threaded environments, use one context per thread for parallel executions. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return the libsais16 reverse BWT context, NULL otherwise. */ LIBSAIS16_API void * libsais16_unbwt_create_ctx_omp(int32_t threads); #endif /** * Destroys the libsais16 reverse BWT context and frees previously allocated memory. * @param ctx The libsais16 reverse BWT context (can be NULL). */ LIBSAIS16_API void libsais16_unbwt_free_ctx(void * ctx); /** * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string (BWT) with primary index. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). * @param n The length of the given 16-bit string. * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL). * @param i The primary index. * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS16_API int32_t libsais16_unbwt(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i); /** * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string (BWT) with primary index using libsais16 reverse BWT context.
* @param ctx The libsais16 reverse BWT context. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). * @param n The length of the given 16-bit string. * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL). * @param i The primary index. * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS16_API int32_t libsais16_unbwt_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i); /** * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string (BWT) with auxiliary indexes. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). * @param n The length of the given 16-bit string. * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL). * @param r The sampling rate for auxiliary indexes (must be power of 2). * @param I [0..(n-1)/r] The input auxiliary indexes. * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS16_API int32_t libsais16_unbwt_aux(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I); /** * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string (BWT) with auxiliary indexes using libsais16 reverse BWT context. * @param ctx The libsais16 reverse BWT context. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). * @param n The length of the given 16-bit string. * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL). * @param r The sampling rate for auxiliary indexes (must be power of 2). 
* @param I [0..(n-1)/r] The input auxiliary indexes. * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS16_API int32_t libsais16_unbwt_aux_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I); #if defined(LIBSAIS_OPENMP) /** * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string (BWT) with primary index in parallel using OpenMP. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). * @param n The length of the given 16-bit string. * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL). * @param i The primary index. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS16_API int32_t libsais16_unbwt_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i, int32_t threads); /** * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string (BWT) with auxiliary indexes in parallel using OpenMP. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). * @param n The length of the given 16-bit string. * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL). * @param r The sampling rate for auxiliary indexes (must be power of 2). * @param I [0..(n-1)/r] The input auxiliary indexes. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. 
*/ LIBSAIS16_API int32_t libsais16_unbwt_aux_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I, int32_t threads); #endif /** * Constructs the permuted longest common prefix array (PLCP) of a given 16-bit string and a suffix array. * @param T [0..n-1] The input 16-bit string. * @param SA [0..n-1] The input suffix array. * @param PLCP [0..n-1] The output permuted longest common prefix array. * @param n The length of the 16-bit string and the suffix array. * @return 0 if no error occurred, -1 otherwise. */ LIBSAIS16_API int32_t libsais16_plcp(const uint16_t * T, const int32_t * SA, int32_t * PLCP, int32_t n); /** * Constructs the permuted longest common prefix array (PLCP) of a given 16-bit string set and a generalized suffix array (GSA). * @param T [0..n-1] The input 16-bit string set using 0 as separators (T[n-1] must be 0). * @param SA [0..n-1] The input generalized suffix array. * @param PLCP [0..n-1] The output permuted longest common prefix array. * @param n The length of the string set and the generalized suffix array. * @return 0 if no error occurred, -1 otherwise. */ LIBSAIS16_API int32_t libsais16_plcp_gsa(const uint16_t * T, const int32_t * SA, int32_t * PLCP, int32_t n); /** * Constructs the longest common prefix array (LCP) of a given permuted longest common prefix array (PLCP) and a suffix array. * @param PLCP [0..n-1] The input permuted longest common prefix array. * @param SA [0..n-1] The input suffix array or generalized suffix array (GSA). * @param LCP [0..n-1] The output longest common prefix array (can be SA). * @param n The length of the permuted longest common prefix array and the suffix array. * @return 0 if no error occurred, -1 otherwise. 
*/ LIBSAIS16_API int32_t libsais16_lcp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP, int32_t n); #if defined(LIBSAIS_OPENMP) /** * Constructs the permuted longest common prefix array (PLCP) of a given 16-bit string and a suffix array in parallel using OpenMP. * @param T [0..n-1] The input 16-bit string. * @param SA [0..n-1] The input suffix array. * @param PLCP [0..n-1] The output permuted longest common prefix array. * @param n The length of the 16-bit string and the suffix array. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 otherwise. */ LIBSAIS16_API int32_t libsais16_plcp_omp(const uint16_t * T, const int32_t * SA, int32_t * PLCP, int32_t n, int32_t threads); /** * Constructs the permuted longest common prefix array (PLCP) of a given 16-bit string set and a generalized suffix array (GSA) in parallel using OpenMP. * @param T [0..n-1] The input 16-bit string set using 0 as separators (T[n-1] must be 0). * @param SA [0..n-1] The input generalized suffix array. * @param PLCP [0..n-1] The output permuted longest common prefix array. * @param n The length of the string set and the generalized suffix array. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 otherwise. */ LIBSAIS16_API int32_t libsais16_plcp_gsa_omp(const uint16_t * T, const int32_t * SA, int32_t * PLCP, int32_t n, int32_t threads); /** * Constructs the longest common prefix array (LCP) of a given permuted longest common prefix array (PLCP) and a suffix array in parallel using OpenMP. * @param PLCP [0..n-1] The input permuted longest common prefix array. * @param SA [0..n-1] The input suffix array or generalized suffix array (GSA). * @param LCP [0..n-1] The output longest common prefix array (can be SA). * @param n The length of the permuted longest common prefix array and the suffix array. 
* @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 otherwise. */ LIBSAIS16_API int32_t libsais16_lcp_omp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP, int32_t n, int32_t threads); #endif #ifdef __cplusplus } #endif #endif ================================================ FILE: include/libsais16x64.h ================================================ /*-- This file is a part of libsais, a library for linear time suffix array, longest common prefix array and burrows wheeler transform construction. Copyright (c) 2021-2025 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. Please see the file LICENSE for full copyright information. --*/ #ifndef LIBSAIS16X64_H #define LIBSAIS16X64_H 1 #define LIBSAIS16X64_VERSION_MAJOR 2 #define LIBSAIS16X64_VERSION_MINOR 10 #define LIBSAIS16X64_VERSION_PATCH 4 #define LIBSAIS16X64_VERSION_STRING "2.10.4" #ifdef _WIN32 #ifdef LIBSAIS_SHARED #ifdef LIBSAIS_EXPORTS #define LIBSAIS16X64_API __declspec(dllexport) #else #define LIBSAIS16X64_API __declspec(dllimport) #endif #else #define LIBSAIS16X64_API #endif #else #define LIBSAIS16X64_API #endif #ifdef __cplusplus extern "C" { #endif #include /** * Constructs the suffix array of a given 16-bit string. * @param T [0..n-1] The input 16-bit string. * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the given 16-bit string. 
* @param fs The extra space available at the end of SA array (0 should be enough for most cases). * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS16X64_API int64_t libsais16x64(const uint16_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq); /** * Constructs the generalized suffix array (GSA) of a given 16-bit string set. * @param T [0..n-1] The input 16-bit string set using 0 as separators (T[n-1] must be 0). * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the given 16-bit string set. * @param fs The extra space available at the end of SA array (0 should be enough for most cases). * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS16X64_API int64_t libsais16x64_gsa(const uint16_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq); /** * Constructs the suffix array of a given integer array. * Note, during construction input array will be modified, but restored at the end if no errors occurred. * @param T [0..n-1] The input integer array. * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the integer array. * @param k The alphabet size of the input integer array. * @param fs Extra space available at the end of SA array (can be 0, but 4k or better 6k is recommended for optimal performance). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS16X64_API int64_t libsais16x64_long(int64_t * T, int64_t * SA, int64_t n, int64_t k, int64_t fs); #if defined(LIBSAIS_OPENMP) /** * Constructs the suffix array of a given 16-bit string in parallel using OpenMP. * @param T [0..n-1] The input 16-bit string. * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the given 16-bit string. * @param fs The extra space available at the end of SA array (0 should be enough for most cases). 
* @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS16X64_API int64_t libsais16x64_omp(const uint16_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq, int64_t threads); /** * Constructs the generalized suffix array (GSA) of a given 16-bit string set in parallel using OpenMP. * @param T [0..n-1] The input 16-bit string set using 0 as separators (T[n-1] must be 0). * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the given 16-bit string set. * @param fs The extra space available at the end of SA array (0 should be enough for most cases). * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS16X64_API int64_t libsais16x64_gsa_omp(const uint16_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq, int64_t threads); /** * Constructs the suffix array of a given integer array in parallel using OpenMP. * Note, during construction input array will be modified, but restored at the end if no errors occurred. * @param T [0..n-1] The input integer array. * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the integer array. * @param k The alphabet size of the input integer array. * @param fs Extra space available at the end of SA array (can be 0, but 4k or better 6k is recommended for optimal performance). * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. 
*/ LIBSAIS16X64_API int64_t libsais16x64_long_omp(int64_t * T, int64_t * SA, int64_t n, int64_t k, int64_t fs, int64_t threads); #endif /** * Constructs the burrows-wheeler transformed 16-bit string (BWT) of a given 16-bit string. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). * @param A [0..n-1+fs] The temporary array. * @param n The length of the given 16-bit string. * @param fs The extra space available at the end of A array (0 should be enough for most cases). * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). * @return The primary index if no error occurred, -1 or -2 otherwise. */ LIBSAIS16X64_API int64_t libsais16x64_bwt(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq); /** * Constructs the burrows-wheeler transformed 16-bit string (BWT) of a given 16-bit string with auxiliary indexes. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). * @param A [0..n-1+fs] The temporary array. * @param n The length of the given 16-bit string. * @param fs The extra space available at the end of A array (0 should be enough for most cases). * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). * @param r The sampling rate for auxiliary indexes (must be power of 2). * @param I [0..(n-1)/r] The output auxiliary indexes. * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS16X64_API int64_t libsais16x64_bwt_aux(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq, int64_t r, int64_t * I); #if defined(LIBSAIS_OPENMP) /** * Constructs the burrows-wheeler transformed 16-bit string (BWT) of a given 16-bit string in parallel using OpenMP. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). * @param A [0..n-1+fs] The temporary array. * @param n The length of the given 16-bit string. 
* @param fs The extra space available at the end of A array (0 should be enough for most cases). * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return The primary index if no error occurred, -1 or -2 otherwise. */ LIBSAIS16X64_API int64_t libsais16x64_bwt_omp(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq, int64_t threads); /** * Constructs the burrows-wheeler transformed 16-bit string (BWT) of a given 16-bit string with auxiliary indexes in parallel using OpenMP. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). * @param A [0..n-1+fs] The temporary array. * @param n The length of the given 16-bit string. * @param fs The extra space available at the end of A array (0 should be enough for most cases). * @param freq [0..65535] The output 16-bit symbol frequency table (can be NULL). * @param r The sampling rate for auxiliary indexes (must be power of 2). * @param I [0..(n-1)/r] The output auxiliary indexes. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS16X64_API int64_t libsais16x64_bwt_aux_omp(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq, int64_t r, int64_t * I, int64_t threads); #endif /** * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string (BWT) with primary index. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). * @param n The length of the given 16-bit string. * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL). * @param i The primary index. * @return 0 if no error occurred, -1 or -2 otherwise. 
*/ LIBSAIS16X64_API int64_t libsais16x64_unbwt(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t i); /** * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string (BWT) with auxiliary indexes. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). * @param n The length of the given 16-bit string. * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL). * @param r The sampling rate for auxiliary indexes (must be power of 2). * @param I [0..(n-1)/r] The input auxiliary indexes. * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS16X64_API int64_t libsais16x64_unbwt_aux(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t r, const int64_t * I); #if defined(LIBSAIS_OPENMP) /** * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string (BWT) with primary index in parallel using OpenMP. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). * @param n The length of the given 16-bit string. * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL). * @param i The primary index. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS16X64_API int64_t libsais16x64_unbwt_omp(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t i, int64_t threads); /** * Constructs the original 16-bit string from a given burrows-wheeler transformed 16-bit string (BWT) with auxiliary indexes in parallel using OpenMP. * @param T [0..n-1] The input 16-bit string. * @param U [0..n-1] The output 16-bit string (can be T). 
* @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). * @param n The length of the given 16-bit string. * @param freq [0..65535] The input 16-bit symbol frequency table (can be NULL). * @param r The sampling rate for auxiliary indexes (must be power of 2). * @param I [0..(n-1)/r] The input auxiliary indexes. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS16X64_API int64_t libsais16x64_unbwt_aux_omp(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t r, const int64_t * I, int64_t threads); #endif /** * Constructs the permuted longest common prefix array (PLCP) of a given 16-bit string and a suffix array. * @param T [0..n-1] The input 16-bit string. * @param SA [0..n-1] The input suffix array. * @param PLCP [0..n-1] The output permuted longest common prefix array. * @param n The length of the 16-bit string and the suffix array. * @return 0 if no error occurred, -1 otherwise. */ LIBSAIS16X64_API int64_t libsais16x64_plcp(const uint16_t * T, const int64_t * SA, int64_t * PLCP, int64_t n); /** * Constructs the permuted longest common prefix array (PLCP) of a given 16-bit string set and a generalized suffix array (GSA). * @param T [0..n-1] The input 16-bit string set using 0 as separators (T[n-1] must be 0). * @param SA [0..n-1] The input generalized suffix array. * @param PLCP [0..n-1] The output permuted longest common prefix array. * @param n The length of the string set and the generalized suffix array. * @return 0 if no error occurred, -1 otherwise. */ LIBSAIS16X64_API int64_t libsais16x64_plcp_gsa(const uint16_t * T, const int64_t * SA, int64_t * PLCP, int64_t n); /** * Constructs the longest common prefix array (LCP) of a given permuted longest common prefix array (PLCP) and a suffix array. * @param PLCP [0..n-1] The input permuted longest common prefix array. 
* @param SA [0..n-1] The input suffix array or generalized suffix array (GSA). * @param LCP [0..n-1] The output longest common prefix array (can be SA). * @param n The length of the permuted longest common prefix array and the suffix array. * @return 0 if no error occurred, -1 otherwise. */ LIBSAIS16X64_API int64_t libsais16x64_lcp(const int64_t * PLCP, const int64_t * SA, int64_t * LCP, int64_t n); #if defined(LIBSAIS_OPENMP) /** * Constructs the permuted longest common prefix array (PLCP) of a given 16-bit string and a suffix array in parallel using OpenMP. * @param T [0..n-1] The input 16-bit string. * @param SA [0..n-1] The input suffix array. * @param PLCP [0..n-1] The output permuted longest common prefix array. * @param n The length of the 16-bit string and the suffix array. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 otherwise. */ LIBSAIS16X64_API int64_t libsais16x64_plcp_omp(const uint16_t * T, const int64_t * SA, int64_t * PLCP, int64_t n, int64_t threads); /** * Constructs the permuted longest common prefix array (PLCP) of a given 16-bit string set and a generalized suffix array (GSA) in parallel using OpenMP. * @param T [0..n-1] The input 16-bit string set using 0 as separators (T[n-1] must be 0). * @param SA [0..n-1] The input generalized suffix array. * @param PLCP [0..n-1] The output permuted longest common prefix array. * @param n The length of the string set and the generalized suffix array. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 otherwise. */ LIBSAIS16X64_API int64_t libsais16x64_plcp_gsa_omp(const uint16_t * T, const int64_t * SA, int64_t * PLCP, int64_t n, int64_t threads); /** * Constructs the longest common prefix array (LCP) of a given permuted longest common prefix array (PLCP) and a suffix array in parallel using OpenMP. 
* @param PLCP [0..n-1] The input permuted longest common prefix array. * @param SA [0..n-1] The input suffix array or generalized suffix array (GSA). * @param LCP [0..n-1] The output longest common prefix array (can be SA). * @param n The length of the permuted longest common prefix array and the suffix array. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 otherwise. */ LIBSAIS16X64_API int64_t libsais16x64_lcp_omp(const int64_t * PLCP, const int64_t * SA, int64_t * LCP, int64_t n, int64_t threads); #endif #ifdef __cplusplus } #endif #endif ================================================ FILE: include/libsais64.h ================================================ /*-- This file is a part of libsais, a library for linear time suffix array, longest common prefix array and burrows wheeler transform construction. Copyright (c) 2021-2025 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. Please see the file LICENSE for full copyright information. 
--*/ #ifndef LIBSAIS64_H #define LIBSAIS64_H 1 #define LIBSAIS64_VERSION_MAJOR 2 #define LIBSAIS64_VERSION_MINOR 10 #define LIBSAIS64_VERSION_PATCH 4 #define LIBSAIS64_VERSION_STRING "2.10.4" #ifdef _WIN32 #ifdef LIBSAIS_SHARED #ifdef LIBSAIS_EXPORTS #define LIBSAIS64_API __declspec(dllexport) #else #define LIBSAIS64_API __declspec(dllimport) #endif #else #define LIBSAIS64_API #endif #else #define LIBSAIS64_API #endif #ifdef __cplusplus extern "C" { #endif #include /** * Constructs the suffix array of a given string. * @param T [0..n-1] The input string. * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the given string. * @param fs The extra space available at the end of SA array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS64_API int64_t libsais64(const uint8_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq); /** * Constructs the generalized suffix array (GSA) of given string set. * @param T [0..n-1] The input string set using 0 as separators (T[n-1] must be 0). * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the given string set. * @param fs The extra space available at the end of SA array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS64_API int64_t libsais64_gsa(const uint8_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq); /** * Constructs the suffix array of a given integer array. * Note, during construction input array will be modified, but restored at the end if no errors occurred. * @param T [0..n-1] The input integer array. * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the integer array. * @param k The alphabet size of the input integer array. 
* @param fs Extra space available at the end of SA array (can be 0, but 4k or better 6k is recommended for optimal performance). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS64_API int64_t libsais64_long(int64_t * T, int64_t * SA, int64_t n, int64_t k, int64_t fs); #if defined(LIBSAIS_OPENMP) /** * Constructs the suffix array of a given string in parallel using OpenMP. * @param T [0..n-1] The input string. * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the given string. * @param fs The extra space available at the end of SA array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS64_API int64_t libsais64_omp(const uint8_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq, int64_t threads); /** * Constructs the generalized suffix array (GSA) of a given string set in parallel using OpenMP. * @param T [0..n-1] The input string set using 0 as separators (T[n-1] must be 0). * @param SA [0..n-1+fs] The output array of suffixes. * @param n The length of the given string set. * @param fs The extra space available at the end of SA array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS64_API int64_t libsais64_gsa_omp(const uint8_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq, int64_t threads); /** * Constructs the suffix array of a given integer array in parallel using OpenMP. * Note, during construction input array will be modified, but restored at the end if no errors occurred. * @param T [0..n-1] The input integer array. * @param SA [0..n-1+fs] The output array of suffixes.
* @param n The length of the integer array. * @param k The alphabet size of the input integer array. * @param fs Extra space available at the end of SA array (can be 0, but 4k or better 6k is recommended for optimal performance). * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS64_API int64_t libsais64_long_omp(int64_t * T, int64_t * SA, int64_t n, int64_t k, int64_t fs, int64_t threads); #endif /** * Constructs the burrows-wheeler transformed string (BWT) of a given string. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n-1+fs] The temporary array. * @param n The length of the given string. * @param fs The extra space available at the end of A array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @return The primary index if no error occurred, -1 or -2 otherwise. */ LIBSAIS64_API int64_t libsais64_bwt(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq); /** * Constructs the burrows-wheeler transformed string (BWT) of a given string with auxiliary indexes. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n-1+fs] The temporary array. * @param n The length of the given string. * @param fs The extra space available at the end of A array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @param r The sampling rate for auxiliary indexes (must be power of 2). * @param I [0..(n-1)/r] The output auxiliary indexes. * @return 0 if no error occurred, -1 or -2 otherwise. 
*/ LIBSAIS64_API int64_t libsais64_bwt_aux(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq, int64_t r, int64_t * I); #if defined(LIBSAIS_OPENMP) /** * Constructs the burrows-wheeler transformed string (BWT) of a given string in parallel using OpenMP. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n-1+fs] The temporary array. * @param n The length of the given string. * @param fs The extra space available at the end of A array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return The primary index if no error occurred, -1 or -2 otherwise. */ LIBSAIS64_API int64_t libsais64_bwt_omp(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq, int64_t threads); /** * Constructs the burrows-wheeler transformed string (BWT) of a given string with auxiliary indexes in parallel using OpenMP. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n-1+fs] The temporary array. * @param n The length of the given string. * @param fs The extra space available at the end of A array (0 should be enough for most cases). * @param freq [0..255] The output symbol frequency table (can be NULL). * @param r The sampling rate for auxiliary indexes (must be power of 2). * @param I [0..(n-1)/r] The output auxiliary indexes. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS64_API int64_t libsais64_bwt_aux_omp(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq, int64_t r, int64_t * I, int64_t threads); #endif /** * Constructs the original string from a given burrows-wheeler transformed string (BWT) with primary index. * @param T [0..n-1] The input string. 
* @param U [0..n-1] The output string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). * @param n The length of the given string. * @param freq [0..255] The input symbol frequency table (can be NULL). * @param i The primary index. * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS64_API int64_t libsais64_unbwt(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t i); /** * Constructs the original string from a given burrows-wheeler transformed string (BWT) with auxiliary indexes. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). * @param n The length of the given string. * @param freq [0..255] The input symbol frequency table (can be NULL). * @param r The sampling rate for auxiliary indexes (must be power of 2). * @param I [0..(n-1)/r] The input auxiliary indexes. * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS64_API int64_t libsais64_unbwt_aux(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t r, const int64_t * I); #if defined(LIBSAIS_OPENMP) /** * Constructs the original string from a given burrows-wheeler transformed string (BWT) with primary index in parallel using OpenMP. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). * @param n The length of the given string. * @param freq [0..255] The input symbol frequency table (can be NULL). * @param i The primary index. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. 
*/ LIBSAIS64_API int64_t libsais64_unbwt_omp(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t i, int64_t threads); /** * Constructs the original string from a given burrows-wheeler transformed string (BWT) with auxiliary indexes in parallel using OpenMP. * @param T [0..n-1] The input string. * @param U [0..n-1] The output string (can be T). * @param A [0..n] The temporary array (NOTE, temporary array must be n + 1 size). * @param n The length of the given string. * @param freq [0..255] The input symbol frequency table (can be NULL). * @param r The sampling rate for auxiliary indexes (must be power of 2). * @param I [0..(n-1)/r] The input auxiliary indexes. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 or -2 otherwise. */ LIBSAIS64_API int64_t libsais64_unbwt_aux_omp(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t r, const int64_t * I, int64_t threads); #endif /** * Constructs the permuted longest common prefix array (PLCP) of a given string and a suffix array. * @param T [0..n-1] The input string. * @param SA [0..n-1] The input suffix array. * @param PLCP [0..n-1] The output permuted longest common prefix array. * @param n The length of the string and the suffix array. * @return 0 if no error occurred, -1 otherwise. */ LIBSAIS64_API int64_t libsais64_plcp(const uint8_t * T, const int64_t * SA, int64_t * PLCP, int64_t n); /** * Constructs the permuted longest common prefix array (PLCP) of a given string set and a generalized suffix array (GSA). * @param T [0..n-1] The input string set using 0 as separators (T[n-1] must be 0). * @param SA [0..n-1] The input generalized suffix array. * @param PLCP [0..n-1] The output permuted longest common prefix array. * @param n The length of the string set and the generalized suffix array. * @return 0 if no error occurred, -1 otherwise. 
*/ LIBSAIS64_API int64_t libsais64_plcp_gsa(const uint8_t * T, const int64_t * SA, int64_t * PLCP, int64_t n); /** * Constructs the longest common prefix array (LCP) of a given permuted longest common prefix array (PLCP) and a suffix array. * @param PLCP [0..n-1] The input permuted longest common prefix array. * @param SA [0..n-1] The input suffix array or generalized suffix array (GSA). * @param LCP [0..n-1] The output longest common prefix array (can be SA). * @param n The length of the permuted longest common prefix array and the suffix array. * @return 0 if no error occurred, -1 otherwise. */ LIBSAIS64_API int64_t libsais64_lcp(const int64_t * PLCP, const int64_t * SA, int64_t * LCP, int64_t n); #if defined(LIBSAIS_OPENMP) /** * Constructs the permuted longest common prefix array (PLCP) of a given string and a suffix array in parallel using OpenMP. * @param T [0..n-1] The input string. * @param SA [0..n-1] The input suffix array. * @param PLCP [0..n-1] The output permuted longest common prefix array. * @param n The length of the string and the suffix array. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 otherwise. */ LIBSAIS64_API int64_t libsais64_plcp_omp(const uint8_t * T, const int64_t * SA, int64_t * PLCP, int64_t n, int64_t threads); /** * Constructs the permuted longest common prefix array (PLCP) of a given string set and a generalized suffix array (GSA) in parallel using OpenMP. * @param T [0..n-1] The input string set using 0 as separators (T[n-1] must be 0). * @param SA [0..n-1] The input generalized suffix array. * @param PLCP [0..n-1] The output permuted longest common prefix array. * @param n The length of the string set and the generalized suffix array. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 otherwise. 
*/ LIBSAIS64_API int64_t libsais64_plcp_gsa_omp(const uint8_t * T, const int64_t * SA, int64_t * PLCP, int64_t n, int64_t threads); /** * Constructs the longest common prefix array (LCP) of a given permuted longest common prefix array (PLCP) and a suffix array in parallel using OpenMP. * @param PLCP [0..n-1] The input permuted longest common prefix array. * @param SA [0..n-1] The input suffix array or generalized suffix array (GSA). * @param LCP [0..n-1] The output longest common prefix array (can be SA). * @param n The length of the permuted longest common prefix array and the suffix array. * @param threads The number of OpenMP threads to use (can be 0 for OpenMP default). * @return 0 if no error occurred, -1 otherwise. */ LIBSAIS64_API int64_t libsais64_lcp_omp(const int64_t * PLCP, const int64_t * SA, int64_t * LCP, int64_t n, int64_t threads); #endif #ifdef __cplusplus } #endif #endif ================================================ FILE: src/libsais.c ================================================ /*-- This file is a part of libsais, a library for linear time suffix array, longest common prefix array and burrows wheeler transform construction. Copyright (c) 2021-2025 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. Please see the file LICENSE for full copyright information. 
--*/ #include "libsais.h" #include #include #include #include #include #if defined(LIBSAIS_OPENMP) #include #else #define UNUSED(_x) (void)(_x) #endif typedef int32_t sa_sint_t; typedef uint32_t sa_uint_t; typedef ptrdiff_t fast_sint_t; typedef size_t fast_uint_t; #define SAINT_BIT (32) #define SAINT_MAX INT32_MAX #define SAINT_MIN INT32_MIN #define ALPHABET_SIZE (1 << CHAR_BIT) #define UNBWT_FASTBITS (17) #define SUFFIX_GROUP_BIT (SAINT_BIT - 1) #define SUFFIX_GROUP_MARKER (((sa_sint_t)1) << (SUFFIX_GROUP_BIT - 1)) #define BUCKETS_INDEX2(_c, _s) (((_c) << 1) + (_s)) #define BUCKETS_INDEX4(_c, _s) (((_c) << 2) + (_s)) #define LIBSAIS_LOCAL_BUFFER_SIZE (2000) #define LIBSAIS_PER_THREAD_CACHE_SIZE (24576) #define LIBSAIS_FLAGS_NONE (0) #define LIBSAIS_FLAGS_BWT (1) #define LIBSAIS_FLAGS_GSA (2) typedef struct LIBSAIS_THREAD_CACHE { sa_sint_t symbol; sa_sint_t index; } LIBSAIS_THREAD_CACHE; typedef union LIBSAIS_THREAD_STATE { struct { fast_sint_t position; fast_sint_t count; fast_sint_t m; fast_sint_t last_lms_suffix; sa_sint_t * buckets; LIBSAIS_THREAD_CACHE * cache; } state; uint8_t padding[64]; } LIBSAIS_THREAD_STATE; typedef struct LIBSAIS_CONTEXT { sa_sint_t * buckets; LIBSAIS_THREAD_STATE * thread_state; fast_sint_t threads; } LIBSAIS_CONTEXT; typedef struct LIBSAIS_UNBWT_CONTEXT { sa_uint_t * bucket2; uint16_t * fastbits; sa_uint_t * buckets; fast_sint_t threads; } LIBSAIS_UNBWT_CONTEXT; #if defined(__GNUC__) || defined(__clang__) #define RESTRICT __restrict__ #elif defined(_MSC_VER) || defined(__INTEL_COMPILER) #define RESTRICT __restrict #else #error Your compiler, configuration or platform is not supported. 
#endif #if defined(__has_builtin) #if __has_builtin(__builtin_prefetch) #define HAS_BUILTIN_PREFETCH #endif #elif defined(__GNUC__) && (((__GNUC__ == 3) && (__GNUC_MINOR__ >= 2)) || (__GNUC__ >= 4)) #define HAS_BUILTIN_PREFETCH #endif #if defined(__has_builtin) #if __has_builtin(__builtin_bswap16) #define HAS_BUILTIN_BSWAP16 #endif #elif defined(__GNUC__) && (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 8)) || (__GNUC__ >= 5)) #define HAS_BUILTIN_BSWAP16 #endif #if defined(HAS_BUILTIN_PREFETCH) #define libsais_prefetchr(address) __builtin_prefetch((const void *)(address), 0, 3) #define libsais_prefetchw(address) __builtin_prefetch((const void *)(address), 1, 3) #elif defined (_M_IX86) || defined (_M_AMD64) #include #define libsais_prefetchr(address) _mm_prefetch((const void *)(address), _MM_HINT_T0) #define libsais_prefetchw(address) _m_prefetchw((const void *)(address)) #elif defined (_M_ARM) #include #define libsais_prefetchr(address) __prefetch((const void *)(address)) #define libsais_prefetchw(address) __prefetchw((const void *)(address)) #elif defined (_M_ARM64) #include #define libsais_prefetchr(address) __prefetch2((const void *)(address), 0) #define libsais_prefetchw(address) __prefetch2((const void *)(address), 16) #else #error Your compiler, configuration or platform is not supported. 
#endif #if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) #if defined(_LITTLE_ENDIAN) \ || (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && BYTE_ORDER == LITTLE_ENDIAN) \ || (defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && _BYTE_ORDER == _LITTLE_ENDIAN) \ || (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) \ || (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) #define __LITTLE_ENDIAN__ #elif defined(_BIG_ENDIAN) \ || (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN) \ || (defined(_BYTE_ORDER) && defined(_BIG_ENDIAN) && _BYTE_ORDER == _BIG_ENDIAN) \ || (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN) \ || (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) #define __BIG_ENDIAN__ #elif defined(_WIN32) #define __LITTLE_ENDIAN__ #endif #endif #if defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) #if defined(HAS_BUILTIN_BSWAP16) #define libsais_bswap16(x) (__builtin_bswap16(x)) #elif defined(_MSC_VER) && !defined(__INTEL_COMPILER) #define libsais_bswap16(x) (_byteswap_ushort(x)) #else #define libsais_bswap16(x) ((uint16_t)(x >> 8) | (uint16_t)(x << 8)) #endif #elif !defined(__LITTLE_ENDIAN__) && defined(__BIG_ENDIAN__) #define libsais_bswap16(x) (x) #else #error Your compiler, configuration or platform is not supported. 
#endif static void * libsais_align_up(const void * address, size_t alignment) { return (void *)((((ptrdiff_t)address) + ((ptrdiff_t)alignment) - 1) & (-((ptrdiff_t)alignment))); } static void * libsais_alloc_aligned(size_t size, size_t alignment) { void * address = malloc(size + sizeof(short) + alignment - 1); if (address != NULL) { void * aligned_address = libsais_align_up((void *)((ptrdiff_t)address + (ptrdiff_t)(sizeof(short))), alignment); ((short *)aligned_address)[-1] = (short)((ptrdiff_t)aligned_address - (ptrdiff_t)address); return aligned_address; } return NULL; } static void libsais_free_aligned(void * aligned_address) { if (aligned_address != NULL) { free((void *)((ptrdiff_t)aligned_address - ((short *)aligned_address)[-1])); } } static LIBSAIS_THREAD_STATE * libsais_alloc_thread_state(sa_sint_t threads) { LIBSAIS_THREAD_STATE * RESTRICT thread_state = (LIBSAIS_THREAD_STATE *)libsais_alloc_aligned((size_t)threads * sizeof(LIBSAIS_THREAD_STATE), 4096); sa_sint_t * RESTRICT thread_buckets = (sa_sint_t *)libsais_alloc_aligned((size_t)threads * 4 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); LIBSAIS_THREAD_CACHE * RESTRICT thread_cache = (LIBSAIS_THREAD_CACHE *)libsais_alloc_aligned((size_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE * sizeof(LIBSAIS_THREAD_CACHE), 4096); if (thread_state != NULL && thread_buckets != NULL && thread_cache != NULL) { fast_sint_t t; for (t = 0; t < threads; ++t) { thread_state[t].state.buckets = thread_buckets; thread_buckets += 4 * ALPHABET_SIZE; thread_state[t].state.cache = thread_cache; thread_cache += LIBSAIS_PER_THREAD_CACHE_SIZE; } return thread_state; } libsais_free_aligned(thread_cache); libsais_free_aligned(thread_buckets); libsais_free_aligned(thread_state); return NULL; } static void libsais_free_thread_state(LIBSAIS_THREAD_STATE * thread_state) { if (thread_state != NULL) { libsais_free_aligned(thread_state[0].state.cache); libsais_free_aligned(thread_state[0].state.buckets); libsais_free_aligned(thread_state); } } 
static LIBSAIS_CONTEXT * libsais_create_ctx_main(sa_sint_t threads) { LIBSAIS_CONTEXT * RESTRICT ctx = (LIBSAIS_CONTEXT *)libsais_alloc_aligned(sizeof(LIBSAIS_CONTEXT), 64); sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais_alloc_aligned((size_t)8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais_alloc_thread_state(threads) : NULL; if (ctx != NULL && buckets != NULL && (thread_state != NULL || threads == 1)) { ctx->buckets = buckets; ctx->threads = threads; ctx->thread_state = thread_state; return ctx; } libsais_free_thread_state(thread_state); libsais_free_aligned(buckets); libsais_free_aligned(ctx); return NULL; } static void libsais_free_ctx_main(LIBSAIS_CONTEXT * ctx) { if (ctx != NULL) { libsais_free_thread_state(ctx->thread_state); libsais_free_aligned(ctx->buckets); libsais_free_aligned(ctx); } } #if defined(LIBSAIS_OPENMP) static sa_sint_t libsais_count_negative_marked_suffixes(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { sa_sint_t count = 0; fast_sint_t i; for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) { count += (SA[i] < 0); } return count; } static sa_sint_t libsais_count_zero_marked_suffixes(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { sa_sint_t count = 0; fast_sint_t i; for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) { count += (SA[i] == 0); } return count; } static void libsais_place_cached_suffixes(sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) { libsais_prefetchr(&cache[i + 2 * prefetch_distance]); libsais_prefetchw(&SA[cache[i + prefetch_distance + 0].symbol]); libsais_prefetchw(&SA[cache[i + prefetch_distance + 1].symbol]); 
libsais_prefetchw(&SA[cache[i + prefetch_distance + 2].symbol]); libsais_prefetchw(&SA[cache[i + prefetch_distance + 3].symbol]); SA[cache[i + 0].symbol] = cache[i + 0].index; SA[cache[i + 1].symbol] = cache[i + 1].index; SA[cache[i + 2].symbol] = cache[i + 2].index; SA[cache[i + 3].symbol] = cache[i + 3].index; } for (j += prefetch_distance + 3; i < j; i += 1) { SA[cache[i].symbol] = cache[i].index; } } static void libsais_compact_and_place_cached_suffixes(sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j, l; for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4) { libsais_prefetchw(&cache[i + prefetch_distance]); cache[l] = cache[i + 0]; l += cache[l].symbol >= 0; cache[l] = cache[i + 1]; l += cache[l].symbol >= 0; cache[l] = cache[i + 2]; l += cache[l].symbol >= 0; cache[l] = cache[i + 3]; l += cache[l].symbol >= 0; } for (j += 3; i < j; i += 1) { cache[l] = cache[i]; l += cache[l].symbol >= 0; } libsais_place_cached_suffixes(SA, cache, omp_block_start, l - omp_block_start); } static void libsais_accumulate_counts_s32_2(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) { sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s]; } } static void libsais_accumulate_counts_s32_3(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) { sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s]; } } static void libsais_accumulate_counts_s32_4(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) { sa_sint_t * RESTRICT bucket01 = bucket00 - 
bucket_stride; sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s]; } } static void libsais_accumulate_counts_s32_5(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) { sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s]; } } static void libsais_accumulate_counts_s32_6(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) { sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride; fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s]; } } static void libsais_accumulate_counts_s32_7(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) { sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride; sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride; fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s]; } } static void 
libsais_accumulate_counts_s32_8(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) { sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride; sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride; sa_sint_t * RESTRICT bucket07 = bucket06 - bucket_stride; fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s]; } } static void libsais_accumulate_counts_s32_9(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) { sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride; sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride; sa_sint_t * RESTRICT bucket07 = bucket06 - bucket_stride; sa_sint_t * RESTRICT bucket08 = bucket07 - bucket_stride; fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s] + bucket08[s]; } } static void libsais_accumulate_counts_s32(sa_sint_t * RESTRICT buckets, fast_sint_t bucket_size, fast_sint_t bucket_stride, fast_sint_t num_buckets) { while (num_buckets >= 9) { libsais_accumulate_counts_s32_9(buckets - (num_buckets - 9) * bucket_stride, bucket_size, bucket_stride); num_buckets -= 8; } switch (num_buckets) { case 1: break; case 2: libsais_accumulate_counts_s32_2(buckets, bucket_size, bucket_stride); break; case 3: libsais_accumulate_counts_s32_3(buckets, bucket_size, 
bucket_stride); break; case 4: libsais_accumulate_counts_s32_4(buckets, bucket_size, bucket_stride); break; case 5: libsais_accumulate_counts_s32_5(buckets, bucket_size, bucket_stride); break; case 6: libsais_accumulate_counts_s32_6(buckets, bucket_size, bucket_stride); break; case 7: libsais_accumulate_counts_s32_7(buckets, bucket_size, bucket_stride); break; case 8: libsais_accumulate_counts_s32_8(buckets, bucket_size, bucket_stride); break; default: break; } } #endif static void libsais_flip_suffix_markers_omp(sa_sint_t * RESTRICT SA, sa_sint_t l, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && l >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (l / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : l - omp_block_start; fast_sint_t i; for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) { SA[i] ^= SAINT_MIN; } } } static void libsais_gather_lms_suffixes_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, fast_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { if (omp_block_size > 0) { const fast_sint_t prefetch_distance = 256; fast_sint_t i, j = omp_block_start + omp_block_size, c0 = T[omp_block_start + omp_block_size - 1], c1 = -1; while (j < n && (c1 = T[j]) == c0) { ++j; } fast_uint_t f0 = c0 >= c1, f1 = 0; for (i = omp_block_start + omp_block_size - 2, j = omp_block_start + 3; i >= j; i -= 4) { libsais_prefetchr(&T[i - prefetch_distance]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0); c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 0); m -= (f0 & ~f1); c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i - 1); m -= (f1 & ~f0); c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 2); m -= (f0 & ~f1); } for (j -= 3; i >= j; i -= 1) { c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i + 1); m -= (f0 & ~f1); } SA[m] = (sa_sint_t)(i + 1); } } static void libsais_gather_lms_suffixes_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && omp_get_dynamic() == 0) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); 
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; if (omp_num_threads == 1) { libsais_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t t, m = 0; for (t = omp_num_threads - 1; t > omp_thread_num; --t) { m += thread_state[t].state.m; } libsais_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1 - m, omp_block_start, omp_block_size); #pragma omp barrier if (thread_state[omp_thread_num].state.m > 0) { SA[(fast_sint_t)n - 1 - m] = (sa_sint_t)thread_state[omp_thread_num].state.last_lms_suffix; } } #endif } } static sa_sint_t libsais_gather_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n) { const fast_sint_t prefetch_distance = 64; sa_sint_t i = n - 2; sa_sint_t m = n - 1; fast_uint_t f0 = 1; fast_uint_t f1 = 0; fast_sint_t c0 = T[n - 1]; fast_sint_t c1 = 0; for (; i >= 3; i -= 4) { libsais_prefetchr(&T[i - prefetch_distance]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = i + 1; m -= (sa_sint_t)(f1 & ~f0); c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = i - 0; m -= (sa_sint_t)(f0 & ~f1); c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = i - 1; m -= (sa_sint_t)(f1 & ~f0); c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = i - 2; m -= (sa_sint_t)(f0 & ~f1); } for (; i >= 0; i -= 1) { c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = i + 1; m -= (sa_sint_t)(f0 & ~f1); } return n - 1 - m; } static sa_sint_t libsais_gather_compacted_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n) { const fast_sint_t prefetch_distance = 64; sa_sint_t i = n - 2; sa_sint_t m = n - 1; fast_uint_t f0 = 1; fast_uint_t f1 = 0; fast_sint_t c0 = T[n - 1]; fast_sint_t c1 = 0; for (; i >= 
3; i -= 4) { libsais_prefetchr(&T[i - prefetch_distance]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = i + 1; m -= (sa_sint_t)(f1 & ~f0 & (c0 >= 0)); c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = i - 0; m -= (sa_sint_t)(f0 & ~f1 & (c1 >= 0)); c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = i - 1; m -= (sa_sint_t)(f1 & ~f0 & (c0 >= 0)); c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = i - 2; m -= (sa_sint_t)(f0 & ~f1 & (c1 >= 0)); } for (; i >= 0; i -= 1) { c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = i + 1; m -= (sa_sint_t)(f0 & ~f1 & (c1 >= 0)); } return n - 1 - m; } #if defined(LIBSAIS_OPENMP) static void libsais_count_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) { const fast_sint_t prefetch_distance = 64; memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t)); sa_sint_t i = n - 2; fast_uint_t f0 = 1; fast_uint_t f1 = 0; fast_sint_t c0 = T[n - 1]; fast_sint_t c1 = 0; for (; i >= prefetch_distance + 3; i -= 4) { libsais_prefetchr(&T[i - 2 * prefetch_distance]); libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]); libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]); libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]); libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++; c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++; c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++; c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++; } 
for (; i >= 0; i -= 1) { c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++; } buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0)]++; } #endif static void libsais_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) { const fast_sint_t prefetch_distance = 64; memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); sa_sint_t i = n - 2; fast_uint_t f0 = 1; fast_uint_t f1 = 0; fast_sint_t c0 = T[n - 1]; fast_sint_t c1 = 0; for (; i >= prefetch_distance + 3; i -= 4) { libsais_prefetchr(&T[i - 2 * prefetch_distance]); libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]); libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]); libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]); libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++; c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++; c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++; c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++; } for (; i >= 0; i -= 1) { c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++; } buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++; } #if defined(LIBSAIS_OPENMP) static void libsais_count_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) { const fast_sint_t prefetch_distance = 64; memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); sa_sint_t i = n - 2; fast_uint_t f0 = 1; fast_uint_t f1 
= 0; fast_sint_t c0 = T[n - 1]; fast_sint_t c1 = 0; for (; i >= prefetch_distance + 3; i -= 4) { libsais_prefetchr(&T[i - 2 * prefetch_distance]); libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]); libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]); libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]); libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++; c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++; c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++; c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++; } for (; i >= 0; i -= 1) { c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++; } c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++; } #endif static sa_sint_t libsais_count_and_gather_lms_suffixes_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { memset(buckets, 0, (size_t)4 * ALPHABET_SIZE * sizeof(sa_sint_t)); fast_sint_t m = omp_block_start + omp_block_size - 1; if (omp_block_size > 0) { const fast_sint_t prefetch_distance = 256; fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1; while (j < n && (c1 = T[j]) == c0) { ++j; } fast_uint_t f0 = c0 >= c1, f1 = 0; for (i = m - 1, j = omp_block_start + 3; i >= j; i -= 4) { libsais_prefetchr(&T[i - prefetch_distance]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - 
(fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0); buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++; c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 0); m -= (f0 & ~f1); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++; c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i - 1); m -= (f1 & ~f0); buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++; c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 2); m -= (f0 & ~f1); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++; } for (j -= 3; i >= j; i -= 1) { c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i + 1); m -= (f0 & ~f1); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++; } c1 = (i >= 0) ? T[i] : -1; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0); buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++; } return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m); } static sa_sint_t libsais_count_and_gather_lms_suffixes_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { sa_sint_t m = 0; #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && omp_get_dynamic() == 0) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : n - omp_block_start; if (omp_num_threads == 1) { m = libsais_count_and_gather_lms_suffixes_8u(T, SA, n, buckets, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size; thread_state[omp_thread_num].state.m = libsais_count_and_gather_lms_suffixes_8u(T, SA, n, thread_state[omp_thread_num].state.buckets, omp_block_start, omp_block_size); if (thread_state[omp_thread_num].state.m > 0) { thread_state[omp_thread_num].state.last_lms_suffix = SA[thread_state[omp_thread_num].state.position - 1]; } } #pragma omp barrier #pragma omp master { memset(buckets, 0, (size_t)4 * ALPHABET_SIZE * sizeof(sa_sint_t)); fast_sint_t t; for (t = omp_num_threads - 1; t >= 0; --t) { m += (sa_sint_t)thread_state[t].state.m; if (t != omp_num_threads - 1 && thread_state[t].state.m > 0) { memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.m], (size_t)thread_state[t].state.m * sizeof(sa_sint_t)); } { sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; fast_sint_t s; for (s = 0; s < 4 * ALPHABET_SIZE; s += 1) { sa_sint_t A = buckets[s], B = temp_bucket[s]; buckets[s] = A + B; temp_bucket[s] = A; } } } } } #endif } return m; } static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t)); fast_sint_t m = omp_block_start + omp_block_size - 1; if (omp_block_size > 0) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1; while (j < n && (c1 = T[j]) == c0) { ++j; } fast_uint_t f0 = c0 >= c1, f1 = 0; for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) { libsais_prefetchr(&T[i - 2 * prefetch_distance]); libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 
0], 0)]); libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]); libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]); libsais_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0); buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++; c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 0); m -= (f0 & ~f1); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++; c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i - 1); m -= (f1 & ~f0); buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++; c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 2); m -= (f0 & ~f1); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++; } for (j -= prefetch_distance + 3; i >= j; i -= 1) { c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i + 1); m -= (f0 & ~f1); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++; } c1 = (i >= 0) ? 
T[i] : -1; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0); buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++; } return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m); } static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); fast_sint_t m = omp_block_start + omp_block_size - 1; if (omp_block_size > 0) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1; while (j < n && (c1 = T[j]) == c0) { ++j; } fast_uint_t f0 = c0 >= c1, f1 = 0; for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) { libsais_prefetchr(&T[i - 2 * prefetch_distance]); libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]); libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]); libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]); libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0); buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++; c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 0); m -= (f0 & ~f1); buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++; c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i - 1); m -= (f1 & ~f0); buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++; c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 2); m -= (f0 & ~f1); buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++; } for (j -= prefetch_distance + 3; i >= j; i -= 1) { c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - 
(fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i + 1); m -= (f0 & ~f1); buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++; } c1 = (i >= 0) ? T[i] : -1; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0); buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++; } return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m); } static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); fast_sint_t m = omp_block_start + omp_block_size - 1; if (omp_block_size > 0) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1; while (j < n && (c1 = T[j]) == c0) { ++j; } fast_uint_t f0 = c0 >= c1, f1 = 0; for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) { libsais_prefetchr(&T[i - 2 * prefetch_distance]); libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]); libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]); libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]); libsais_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0 & (c0 >= 0)); c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++; c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 0); m -= (f0 & ~f1 & (c1 >= 0)); c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++; c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i - 1); m -= (f1 & ~f0 & (c0 >= 0)); c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++; c0 = T[i 
- 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 2); m -= (f0 & ~f1 & (c1 >= 0)); c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++;
        }

        /* Scalar tail: same step as above, one position at a time, down to omp_block_start. */
        for (j -= prefetch_distance + 3; i >= j; i -= 1)
        {
            c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i + 1); m -= (f0 & ~f1 & (c1 >= 0)); c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++;
        }

        /* Account for the block's first position, using the symbol just before
           the block, or -1 when the block starts at index 0. */
        c1 = (i >= 0) ? T[i] : -1; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0 & (c0 >= 0)); c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++;
    }

    /* m moved down once per kept position, so the distance travelled is the count. */
    return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
}

#if defined(LIBSAIS_OPENMP)

/*
 * Picks the spacing between consecutive per-thread bucket arrays.
 * Prefers bucket_size rounded up to a multiple of 1024, then to a multiple
 * of 16, and falls back to bucket_size itself; a rounded value is accepted
 * only if free_space / (num_buckets - 1) can hold it.
 * The divisor is num_buckets - 1, so num_buckets must not be 1; the callers
 * in this part of the file only reach it when more than one thread runs.
 */
static fast_sint_t libsais_get_bucket_stride(fast_sint_t free_space, fast_sint_t bucket_size, fast_sint_t num_buckets)
{
    fast_sint_t bucket_size_1024 = (bucket_size + 1023) & (-1024); if (free_space / (num_buckets - 1) >= bucket_size_1024) { return bucket_size_1024; }
    fast_sint_t bucket_size_16 = (bucket_size + 15) & (-16); if (free_space / (num_buckets - 1) >= bucket_size_16) { return bucket_size_16; }

    return bucket_size;
}

/*
 * Multi-threaded count-and-gather (4 counters per symbol) that gives every
 * thread its own bucket array, placed at decreasing addresses below
 * `buckets`, in free space described by local_buckets / the gap after SA[n].
 * Falls back to a single direct call when only one thread runs.
 * Returns the total number of gathered suffixes (m).
 */
static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t m = 0;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(local_buckets); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Split [0, n) into per-thread blocks whose size is a multiple of 16;
           the last thread also takes the remainder. */
        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t
omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

        if (omp_num_threads == 1)
        {
            m = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            fast_sint_t bucket_size = 4 * (fast_sint_t)k;
            /* Room available for the extra per-thread bucket arrays. */
            fast_sint_t free_space = local_buckets ? LIBSAIS_LOCAL_BUFFER_SIZE : (buckets - &SA[n]);
            fast_sint_t bucket_stride = libsais_get_bucket_stride(free_space, bucket_size, omp_num_threads);

            /* Phase 1: each thread processes its own block into its own bucket
               array and records where its gathered run ends and how long it is. */
            {
                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
                thread_state[omp_thread_num].state.count = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* Phase 2: the last thread packs all gathered runs so they end at
               SA[n - 1] (its own run already ends there), walking threads from
               last to first. The remaining threads merge the counters. */
            if (omp_thread_num == omp_num_threads - 1)
            {
                fast_sint_t t;
                for (t = omp_num_threads - 1; t >= 0; --t)
                {
                    m += (sa_sint_t)thread_state[t].state.count;

                    if (t != omp_num_threads - 1 && thread_state[t].state.count > 0)
                    {
                        memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.count], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
                    }
                }
            }
            else
            {
                /* Re-partition the counter range among the other threads only. */
                omp_num_threads = omp_num_threads - 1;
                omp_block_stride = (bucket_size / omp_num_threads) & (-16);
                omp_block_start = omp_thread_num * omp_block_stride;
                omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : bucket_size - omp_block_start;

                /* omp_num_threads was decremented above, so "+ 1" passes the real
                   number of per-thread bucket arrays.
                   NOTE(review): helper is defined elsewhere; presumably it sums the
                   arrays spaced bucket_stride apart into the primary one -- confirm. */
                libsais_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads + 1);
            }
        }
#endif
    }

    return m;
}

/*
 * Multi-threaded count-and-gather with 2 counters per symbol. Each thread
 * gets its own bucket array placed below `buckets`; with a single thread the
 * work is done by one direct call.
 * Returns the total number of gathered suffixes (m).
 */
static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t m = 0;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(local_buckets); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Per-thread blocks are multiples of 16; the last thread takes the remainder. */
        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

        if (omp_num_threads == 1)
        {
            m = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            fast_sint_t bucket_size = 2 * (fast_sint_t)k;
            fast_sint_t free_space = local_buckets ?
LIBSAIS_LOCAL_BUFFER_SIZE : (buckets - &SA[n]);
            fast_sint_t bucket_stride = libsais_get_bucket_stride(free_space, bucket_size, omp_num_threads);

            /* Phase 1: every thread counts/gathers its block into a private bucket
               array and publishes the end position and length of its gathered run. */
            {
                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
                thread_state[omp_thread_num].state.count = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* Phase 2: the last thread concatenates the runs so that they end at
               SA[n - 1]; its own run needs no copy. Other threads merge counters. */
            if (omp_thread_num == omp_num_threads - 1)
            {
                fast_sint_t t;
                for (t = omp_num_threads - 1; t >= 0; --t)
                {
                    m += (sa_sint_t)thread_state[t].state.count;

                    if (t != omp_num_threads - 1 && thread_state[t].state.count > 0)
                    {
                        memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.count], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
                    }
                }
            }
            else
            {
                /* Divide the counter range among the threads that are not copying. */
                omp_num_threads = omp_num_threads - 1;
                omp_block_stride = (bucket_size / omp_num_threads) & (-16);
                omp_block_start = omp_thread_num * omp_block_stride;
                omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : bucket_size - omp_block_start;

                /* "+ 1" restores the real thread count after the decrement above. */
                libsais_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads + 1);
            }
        }
#endif
    }

    return m;
}

/*
 * Multi-threaded count-and-gather for the compacted variant (2 counters per
 * symbol). Unlike the non-compacted *_fs_omp functions it returns nothing,
 * gathers into the second half of SA (SA + n), and measures free space from
 * &SA[2 * n]. With a single thread it makes one direct call on SA.
 */
static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(local_buckets); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Per-thread blocks are multiples of 16; the last thread takes the remainder. */
        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

        if (omp_num_threads == 1)
        {
            libsais_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            fast_sint_t bucket_size = 2 * (fast_sint_t)k;
            fast_sint_t free_space = local_buckets ?
LIBSAIS_LOCAL_BUFFER_SIZE : (buckets - &SA[(fast_sint_t)n + (fast_sint_t)n]);
            fast_sint_t bucket_stride = libsais_get_bucket_stride(free_space, bucket_size, omp_num_threads);

            /* Phase 1: gather into the upper half (SA + n) with a private bucket
               array, and publish this thread's run end position and length. */
            {
                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
                thread_state[omp_thread_num].state.count = libsais_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA + n, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* Phase 2: every thread copies its own run from the upper half into
               its final place in the lower half. m is the combined length of this
               thread's run and of all higher-numbered threads' runs, so the runs
               end up packed against SA[n - 1] in thread order. */
            {
                fast_sint_t t, m = 0; for (t = omp_num_threads - 1; t >= omp_thread_num; --t) { m += (sa_sint_t)thread_state[t].state.count; }

                if (thread_state[omp_thread_num].state.count > 0)
                {
                    memcpy(&SA[n - m], &SA[n + thread_state[omp_thread_num].state.position - thread_state[omp_thread_num].state.count], (size_t)thread_state[omp_thread_num].state.count * sizeof(sa_sint_t));
                }
            }

            /* Phase 3: all threads share the counter merge (no thread is reserved
               for copying here, so the thread count is used as is). */
            {
                omp_block_stride = (bucket_size / omp_num_threads) & (-16);
                omp_block_start = omp_thread_num * omp_block_stride;
                omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : bucket_size - omp_block_start;

                libsais_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads);
            }
        }
#endif
    }
}

#endif

/*
 * Count-and-gather (4 counters per symbol) that needs no extra bucket space.
 * At most two threads are used: thread 0 only counts, the other thread only
 * gathers. With one thread a single combined pass over [0, n) is made.
 * Returns the number of gathered suffixes.
 */
static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads)
{
    sa_sint_t m = 0;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads);

        fast_sint_t omp_num_threads = 1;
#endif
        if (omp_num_threads == 1)
        {
            m = libsais_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, 0, n);
        }
#if defined(LIBSAIS_OPENMP)
        else if (omp_thread_num == 0)
        {
            libsais_count_lms_suffixes_32s_4k(T, n, k, buckets);
        }
        else
        {
            /* Only this thread assigns m. */
            m = libsais_gather_lms_suffixes_32s(T, SA, n);
        }
#endif
    }

    return m;
}

/*
 * Same scheme as the 4k variant above, with 2 counters per symbol: one
 * thread counts while the other gathers, or one combined pass if serial.
 * Returns the number of gathered suffixes.
 */
static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads)
{
    sa_sint_t m = 0;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads);

        fast_sint_t omp_num_threads = 1;
#endif
        if (omp_num_threads == 1)
        {
            m = libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
        }
#if defined(LIBSAIS_OPENMP)
        else if (omp_thread_num == 0)
        {
            libsais_count_lms_suffixes_32s_2k(T, n, k, buckets);
        }
        else
        {
            m = libsais_gather_lms_suffixes_32s(T, SA, n);
        }
#endif
    }

    return m;
}

/*
 * Compacted-text version of the two-thread count/gather split
 * (2 counters per symbol). Returns the number of gathered suffixes.
 */
static sa_sint_t libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT
buckets, sa_sint_t threads)
{
    sa_sint_t m = 0;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads);

        fast_sint_t omp_num_threads = 1;
#endif
        if (omp_num_threads == 1)
        {
            m = libsais_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
        }
#if defined(LIBSAIS_OPENMP)
        else if (omp_thread_num == 0)
        {
            /* Thread 0 counts ... */
            libsais_count_compacted_lms_suffixes_32s_2k(T, n, k, buckets);
        }
        else
        {
            /* ... while the other thread gathers and sets m. */
            m = libsais_gather_compacted_lms_suffixes_32s(T, SA, n);
        }
#endif
    }

    return m;
}

/*
 * Entry point for count-and-gather with 4 counters per symbol.
 * Under OpenMP it works out how many per-thread bucket arrays (each
 * 4 * k rounded up to a multiple of 16) fit in the free space, caps that at
 * `threads`, and uses the free-space variant when more than one fits,
 * n >= 65536 and n / k >= 2. The thread count is additionally capped at
 * n / 16 / k but never passed as less than 2. Otherwise, and in non-OpenMP
 * builds, the nofs variant is used.
 * Returns the number of gathered suffixes.
 */
static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t m;

#if defined(LIBSAIS_OPENMP)
    fast_sint_t free_space = local_buckets ? LIBSAIS_LOCAL_BUFFER_SIZE : (buckets - &SA[n]);
    sa_sint_t max_threads = (sa_sint_t)(free_space / ((4 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; }

    if (max_threads > 1 && n >= 65536 && n / k >= 2)
    {
        if (max_threads > n / 16 / k) { max_threads = n / 16 / k; }
        m = libsais_count_and_gather_lms_suffixes_32s_4k_fs_omp(T, SA, n, k, buckets, local_buckets, max_threads > 2 ? max_threads : 2, thread_state);
    }
    else
#else
    UNUSED(local_buckets); UNUSED(thread_state);
#endif
    {
        /* Under OpenMP this block is the `else` branch; otherwise it always runs. */
        m = libsais_count_and_gather_lms_suffixes_32s_4k_nofs_omp(T, SA, n, k, buckets, threads);
    }

    return m;
}

/*
 * Entry point for count-and-gather with 2 counters per symbol. Same
 * selection logic as the 4k entry point, using a per-thread array of
 * 2 * k counters and a thread cap of n / 8 / k.
 * Returns the number of gathered suffixes.
 */
static sa_sint_t libsais_count_and_gather_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t m;

#if defined(LIBSAIS_OPENMP)
    fast_sint_t free_space = local_buckets ?
LIBSAIS_LOCAL_BUFFER_SIZE : (buckets - &SA[n]);
    /* How many 16-aligned arrays of 2 * k counters fit, capped at `threads`. */
    sa_sint_t max_threads = (sa_sint_t)(free_space / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; }

    if (max_threads > 1 && n >= 65536 && n / k >= 2)
    {
        if (max_threads > n / 8 / k) { max_threads = n / 8 / k; }
        m = libsais_count_and_gather_lms_suffixes_32s_2k_fs_omp(T, SA, n, k, buckets, local_buckets, max_threads > 2 ? max_threads : 2, thread_state);
    }
    else
#else
    UNUSED(local_buckets); UNUSED(thread_state);
#endif
    {
        /* Under OpenMP this block is the `else` branch; otherwise it always runs. */
        m = libsais_count_and_gather_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads);
    }

    return m;
}

/*
 * Entry point for the compacted count-and-gather (2 counters per symbol).
 * Differences from the non-compacted 2k entry point visible here: it
 * returns nothing, free space is measured from &SA[2 * n], and the
 * free-space variant is never chosen when local_buckets is set.
 */
static void libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    fast_sint_t free_space = local_buckets ? LIBSAIS_LOCAL_BUFFER_SIZE : (buckets - &SA[(fast_sint_t)n + (fast_sint_t)n]);
    sa_sint_t max_threads = (sa_sint_t)(free_space / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; }

    if (!local_buckets && max_threads > 1 && n >= 65536 && n / k >= 2)
    {
        if (max_threads > n / 8 / k) { max_threads = n / 8 / k; }
        libsais_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(T, SA, n, k, buckets, local_buckets, max_threads > 2 ?
max_threads : 2, thread_state);
    }
    else
#else
    UNUSED(local_buckets); UNUSED(thread_state);
#endif
    {
        libsais_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads);
    }
}

/*
 * Builds a histogram of T[0 .. n-1]: buckets[c] receives the number of
 * occurrences of symbol c. The first k entries of buckets are zeroed first.
 */
static void libsais_count_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets)
{
    const fast_sint_t prefetch_distance = 64;

    memset(buckets, 0, (size_t)k * sizeof(sa_sint_t));

    fast_sint_t i, j;
    /* Eight symbols per iteration while a full group remains. */
    for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8)
    {
        libsais_prefetchr(&T[i + prefetch_distance]);

        buckets[T[i + 0]]++;
        buckets[T[i + 1]]++;
        buckets[T[i + 2]]++;
        buckets[T[i + 3]]++;
        buckets[T[i + 4]]++;
        buckets[T[i + 5]]++;
        buckets[T[i + 6]]++;
        buckets[T[i + 7]]++;
    }

    /* Remaining (fewer than eight) symbols. */
    for (j += 7; i < j; i += 1)
    {
        buckets[T[i]]++;
    }
}

/*
 * Turns the four per-symbol counters at the front of `buckets` into bucket
 * boundaries for an ALPHABET_SIZE-symbol alphabet:
 *   bucket_start = &buckets[6 * ALPHABET_SIZE]  (running total before the symbol)
 *   bucket_end   = &buckets[7 * ALPHABET_SIZE]  (running total including it)
 * If freq is not NULL, freq[j] also receives the symbol's total count.
 * Returns 1 + the largest symbol with a non-zero total (0 if there is none).
 * The two branches are identical except for the write to freq.
 */
static sa_sint_t libsais_initialize_buckets_start_and_end_8u(sa_sint_t * RESTRICT buckets, sa_sint_t * RESTRICT freq)
{
    sa_sint_t * RESTRICT bucket_start = &buckets[6 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE];

    /* Largest symbol seen with a non-zero count; -1 until one is found. */
    fast_sint_t k = -1;

    if (freq != NULL)
    {
        fast_sint_t i, j; sa_sint_t sum = 0;
        for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1)
        {
            sa_sint_t total = buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)];

            bucket_start[j] = sum; sum += total; bucket_end[j] = sum; k = total > 0 ? j : k;

            freq[j] = total;
        }
    }
    else
    {
        fast_sint_t i, j; sa_sint_t sum = 0;
        for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1)
        {
            sa_sint_t total = buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)];

            bucket_start[j] = sum; sum += total; bucket_end[j] = sum; k = total > 0 ?
j : k;
        }
    }

    return (sa_sint_t)(k + 1);
}

/*
 * 6k layout: sums the four counters of each of the k symbols and writes
 * bucket_start (&buckets[4 * k], running total before the symbol) and
 * bucket_end (&buckets[5 * k], running total including it).
 */
static void libsais_initialize_buckets_start_and_end_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
{
    sa_sint_t * RESTRICT bucket_start = &buckets[4 * (fast_sint_t)k];
    sa_sint_t * RESTRICT bucket_end = &buckets[5 * (fast_sint_t)k];

    fast_sint_t i, j; sa_sint_t sum = 0;
    for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1)
    {
        bucket_start[j] = sum;
        sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)];
        bucket_end[j] = sum;
    }
}

/*
 * 4k layout: sums the two counters of each of the k symbols and writes
 * bucket_start (&buckets[2 * k]) and bucket_end (&buckets[3 * k]).
 */
static void libsais_initialize_buckets_start_and_end_32s_4k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
{
    sa_sint_t * RESTRICT bucket_start = &buckets[2 * (fast_sint_t)k];
    sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k];

    fast_sint_t i, j; sa_sint_t sum = 0;
    for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1)
    {
        bucket_start[j] = sum;
        sum += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
        bucket_end[j] = sum;
    }
}

/*
 * 2k layout, in place: replaces slot 0 of every symbol's counter pair with
 * the running total of both counters up to and including that symbol.
 * Slot 1 is left unchanged.
 */
static void libsais_initialize_buckets_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
{
    fast_sint_t i; sa_sint_t sum0 = 0;
    for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0))
    {
        sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; buckets[i + BUCKETS_INDEX2(0, 0)] = sum0;
    }
}

/*
 * 2k layout, in place: packs slot 0 of each counter pair into
 * buckets[0 .. k-1], then fills buckets[k .. 2k-1] with the same values
 * shifted up by one symbol (buckets[k] = 0, buckets[k + 1 + j] = buckets[j]).
 * NOTE(review): presumably slot 0 holds bucket ends on entry, which makes
 * the first half the ends and the second half the starts -- confirm.
 */
static void libsais_initialize_buckets_start_and_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
{
    fast_sint_t i, j;
    for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1)
    {
        buckets[j] = buckets[i];
    }

    buckets[k] = 0; memcpy(&buckets[k + 1], buckets, ((size_t)k - 1) * sizeof(sa_sint_t));
}

/*
 * 1k layout, in place: converts k per-symbol counts into the running total
 * of all smaller symbols (each entry becomes the sum of the entries before it).
 */
static void libsais_initialize_buckets_start_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
{
    fast_sint_t i; sa_sint_t sum = 0;
    for
(i = 0; i <= (fast_sint_t)k - 1; i += 1) { sa_sint_t tmp = buckets[i]; buckets[i] = sum; sum += tmp; }
}

/*
 * 1k layout, in place: converts k per-symbol counts into running totals
 * that include the symbol itself.
 */
static void libsais_initialize_buckets_end_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
{
    fast_sint_t i; sa_sint_t sum = 0;
    for (i = 0; i <= (fast_sint_t)k - 1; i += 1) { sum += buckets[i]; buckets[i] = sum; }
}

/*
 * Prepares the radix-sort buckets for an 8-bit text.
 *
 * Step 1 walks T from first_lms_suffix down to 0 and decrements, for every
 * position visited, the counter selected by its symbol and the two-bit type
 * code (f1 + f1 + f0), removing those positions from the 4-way counts.
 * Step 2 fills temp_bucket (&buckets[4 * ALPHABET_SIZE]), two entries per
 * symbol, from the running sum of sub-counters 1 and 3: slot 1 gets the sum
 * before the symbol, slot 0 the sum including it.
 *
 * Returns the final running sum.
 */
static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
{
    {
        fast_uint_t f0 = 0;
        fast_uint_t f1 = 0;
        fast_sint_t c0 = T[first_lms_suffix];
        fast_sint_t c1 = 0;

        for (; --first_lms_suffix >= 0; )
        {
            c1 = c0; c0 = T[first_lms_suffix]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1)));

            buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]--;
        }

        /* Position 0 has no predecessor, so its code uses only its own flag. */
        buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0)]--;
    }

    {
        sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];

        fast_sint_t i, j; sa_sint_t sum = 0;
        for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0))
        {
            temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum; sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum;
        }

        return sum;
    }
}

/*
 * Prepares the 2k-layout buckets for radix sorting, in place.
 * First moves one count of symbol T[first_lms_suffix] from slot 1 to slot 0.
 * Then, per symbol, slot 0 becomes the running total of both counters and
 * slot 1 the running total of slot-1 counters, both including the symbol.
 */
static void libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
{
    buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++;
    buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--;

    fast_sint_t i; sa_sint_t sum0 = 0, sum1 = 0;
    for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0))
    {
        sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
        sum1 += buckets[i + BUCKETS_INDEX2(0, 1)];

        buckets[i + BUCKETS_INDEX2(0, 0)] = sum0;
        buckets[i + BUCKETS_INDEX2(0, 1)] = sum1;
    }
}

/*
 * 6k-layout counterpart of the 8u radix-sort bucket preparation: the same
 * backward walk from first_lms_suffix, then temp_bucket (&buckets[4 * k])
 * gets one entry per symbol holding the running sum of sub-counters 1 and 3
 * including that symbol. Returns the final running sum.
 */
static sa_sint_t libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(const sa_sint_t * RESTRICT T,
sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
{
    {
        fast_uint_t f0 = 0;
        fast_uint_t f1 = 0;
        fast_sint_t c0 = T[first_lms_suffix];
        fast_sint_t c1 = 0;

        /* Remove every position at or before first_lms_suffix from the 4-way counts. */
        for (; --first_lms_suffix >= 0; )
        {
            c1 = c0; c0 = T[first_lms_suffix]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1)));

            buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]--;
        }

        buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0)]--;
    }

    {
        sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k];

        fast_sint_t i, j; sa_sint_t sum = 0;
        for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1)
        {
            sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum;
        }

        return sum;
    }
}

/*
 * 4k layout: prepares buckets for both radix sorting and partial sorting.
 * Moves one count of symbol T[first_lms_suffix] from slot 1 to slot 0, then
 * per symbol writes
 *   bucket_start (&buckets[2 * k]) = running total of both counters before it,
 *   bucket_end   (&buckets[3 * k]) = running total including it,
 * and overwrites slot 1 with the running total of slot-1 counters
 * (including the symbol). Slot 0 is not rewritten by the loop.
 */
static void libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
{
    sa_sint_t * RESTRICT bucket_start = &buckets[2 * (fast_sint_t)k];
    sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k];

    buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++;
    buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--;

    fast_sint_t i, j; sa_sint_t sum0 = 0, sum1 = 0;
    for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1)
    {
        bucket_start[j] = sum1;

        sum0 += buckets[i + BUCKETS_INDEX2(0, 1)];
        sum1 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
        buckets[i + BUCKETS_INDEX2(0, 1)] = sum0;

        bucket_end[j] = sum1;
    }
}

/*
 * Distributes the suffix positions stored in
 * SA[omp_block_start .. omp_block_start + omp_block_size - 1] into buckets
 * keyed by their first symbol. Entries are read from the highest index down;
 * each position p is written to SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]],
 * so every bucket fills from its end towards its start.
 */
static void libsais_radix_sort_lms_suffixes_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Four entries per iteration; stops early enough that the prefetch
       look-ahead stays inside the block. */
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
    {
        libsais_prefetchr(&SA[i - 2 * prefetch_distance]);

        libsais_prefetchr(&T[SA[i -
prefetch_distance - 0]]);
        libsais_prefetchr(&T[SA[i - prefetch_distance - 1]]);
        libsais_prefetchr(&T[SA[i - prefetch_distance - 2]]);
        libsais_prefetchr(&T[SA[i - prefetch_distance - 3]]);

        sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0;
        sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1;
        sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2;
        sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3;
    }

    /* Remaining entries, one at a time, without prefetching. */
    for (j -= prefetch_distance + 3; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p;
    }
}

/*
 * Radix-sorts the m gathered suffixes at the end of SA (8-bit text).
 *
 * With LIBSAIS_FLAGS_GSA set, buckets[4 * ALPHABET_SIZE] is decremented
 * first. Serial path: entries SA[n - m + 1 .. n - 1] (m - 1 of them) are
 * distributed using &buckets[4 * ALPHABET_SIZE]. Parallel path (only when
 * threads > 1, n and m >= 65536 and OpenMP dynamic adjustment is off):
 * each thread derives private bucket pointers and distributes the run
 * described by its own thread_state entry.
 */
static void libsais_radix_sort_lms_suffixes_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t flags, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    if (flags & LIBSAIS_FLAGS_GSA) { buckets[4 * ALPHABET_SIZE]--; }

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && m >= 65536 && omp_get_dynamic() == 0)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_num_threads = 1;
#endif
        if (omp_num_threads == 1)
        {
            libsais_radix_sort_lms_suffixes_8u(T, SA, &buckets[4 * ALPHABET_SIZE], (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /* Build this thread's bucket pointers: the shared value minus the
               value already held in the thread's own 4-way array at sub-index 1.
               NOTE(review): state.buckets is filled outside this chunk;
               presumably it holds cumulative per-thread counts -- confirm. */
            {
                sa_sint_t * RESTRICT src_bucket = &buckets[4 * ALPHABET_SIZE];
                sa_sint_t * RESTRICT dst_bucket = thread_state[omp_thread_num].state.buckets;

                fast_sint_t i, j;
                for (i = BUCKETS_INDEX2(0, 0), j = BUCKETS_INDEX4(0, 1); i <= BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX2(1, 0), j += BUCKETS_INDEX4(1, 0))
                {
                    dst_bucket[i] = src_bucket[i] - dst_bucket[j];
                }
            }

            /* Locate this thread's run: it starts omp_block_start entries
               before SA[n], where omp_block_start is the combined state.m of
               this thread and all higher-numbered ones. */
            {
                fast_sint_t t, omp_block_start = 0, omp_block_size = thread_state[omp_thread_num].state.m;
                for (t = omp_num_threads -
1; t >= omp_thread_num; --t) omp_block_start += thread_state[t].state.m;

                /* The run that reaches back to SA[n - m] drops its first entry,
                   matching the serial path, which also skips SA[n - m]. */
                if (omp_block_start == (fast_sint_t)m && omp_block_size > 0)
                {
                    omp_block_start -= 1; omp_block_size -= 1;
                }

                libsais_radix_sort_lms_suffixes_8u(T, SA, thread_state[omp_thread_num].state.buckets, (fast_sint_t)n - omp_block_start, omp_block_size);
            }
        }
#endif
    }
}

/*
 * Distributes the suffix positions in
 * SA[omp_block_start .. omp_block_start + omp_block_size - 1] into buckets
 * keyed by first symbol, for a 32-bit text with one bucket pointer per
 * symbol: each p goes to SA[--induction_bucket[T[p]]]. Entries are read
 * from the highest index down.
 */
static void libsais_radix_sort_lms_suffixes_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Three-stage prefetch pipeline: SA three distances ahead, the text
       symbols two ahead, the bucket pointers one ahead. */
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4)
    {
        libsais_prefetchr(&SA[i - 3 * prefetch_distance]);

        libsais_prefetchr(&T[SA[i - 2 * prefetch_distance - 0]]);
        libsais_prefetchr(&T[SA[i - 2 * prefetch_distance - 1]]);
        libsais_prefetchr(&T[SA[i - 2 * prefetch_distance - 2]]);
        libsais_prefetchr(&T[SA[i - 2 * prefetch_distance - 3]]);

        libsais_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 0]]]);
        libsais_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 1]]]);
        libsais_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 2]]]);
        libsais_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 3]]]);

        sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[T[p0]]] = p0;
        sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[T[p1]]] = p1;
        sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[T[p2]]] = p2;
        sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[T[p3]]] = p3;
    }

    /* Remaining entries, one at a time, without prefetching. */
    for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; SA[--induction_bucket[T[p]]] = p;
    }
}

/*
 * Same distribution as the 6k variant, but the bucket pointer of symbol c
 * lives at induction_bucket[BUCKETS_INDEX2(c, 0)] (two entries per symbol).
 */
static void libsais_radix_sort_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 *
prefetch_distance + 3; i >= j; i -= 4)
    {
        libsais_prefetchr(&SA[i - 3 * prefetch_distance]);

        libsais_prefetchr(&T[SA[i - 2 * prefetch_distance - 0]]);
        libsais_prefetchr(&T[SA[i - 2 * prefetch_distance - 1]]);
        libsais_prefetchr(&T[SA[i - 2 * prefetch_distance - 2]]);
        libsais_prefetchr(&T[SA[i - 2 * prefetch_distance - 3]]);

        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 0]], 0)]);
        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 1]], 0)]);
        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 2]], 0)]);
        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 3]], 0)]);

        sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0;
        sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1;
        sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2;
        sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3;
    }

    /* Remaining entries, one at a time, without prefetching. */
    for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p;
    }
}

#if defined(LIBSAIS_OPENMP)

/*
 * Block-parallel step 1 (gather): for every i in
 * [omp_block_start, omp_block_start + omp_block_size) stores the suffix
 * position SA[i] in cache[i].index and its first symbol T[SA[i]] in
 * cache[i].symbol. Touches neither SA nor any bucket array, so threads can
 * run it on disjoint ranges.
 */
static void libsais_radix_sort_lms_suffixes_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
    {
        libsais_prefetchr(&SA[i + 2 * prefetch_distance]);

        libsais_prefetchr(&T[SA[i + prefetch_distance + 0]]);
        libsais_prefetchr(&T[SA[i + prefetch_distance + 1]]);
        libsais_prefetchr(&T[SA[i + prefetch_distance + 2]]);
        libsais_prefetchr(&T[SA[i + prefetch_distance + 3]]);

        libsais_prefetchw(&cache[i + prefetch_distance]);

        /* The inner assignment records the index and yields it as the T subscript. */
        cache[i + 0].symbol = T[cache[i + 0].index = SA[i + 0]];
        cache[i + 1].symbol = T[cache[i + 1].index = SA[i + 1]];
        cache[i +
2].symbol = T[cache[i + 2].index = SA[i + 2]];
        cache[i + 3].symbol = T[cache[i + 3].index = SA[i + 3]];
    }

    /* Remaining entries, one at a time. */
    for (j += prefetch_distance + 3; i < j; i += 1)
    {
        cache[i].symbol = T[cache[i].index = SA[i]];
    }
}

/*
 * Block-parallel step 2 (sort), one bucket pointer per symbol: walks the
 * cache from the highest index down and replaces each entry's symbol with
 * its destination slot, --induction_bucket[symbol]. Because it mutates the
 * shared bucket pointers in order, the *_block_omp driver runs it on the
 * master thread only.
 */
static void libsais_radix_sort_lms_suffixes_32s_6k_block_sort(sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
    {
        libsais_prefetchw(&cache[i - 2 * prefetch_distance]);

        libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 0].symbol]);
        libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 1].symbol]);
        libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 2].symbol]);
        libsais_prefetchw(&induction_bucket[cache[i - prefetch_distance - 3].symbol]);

        cache[i - 0].symbol = --induction_bucket[cache[i - 0].symbol];
        cache[i - 1].symbol = --induction_bucket[cache[i - 1].symbol];
        cache[i - 2].symbol = --induction_bucket[cache[i - 2].symbol];
        cache[i - 3].symbol = --induction_bucket[cache[i - 3].symbol];
    }

    /* Remaining entries, one at a time. */
    for (j -= prefetch_distance + 3; i >= j; i -= 1)
    {
        cache[i].symbol = --induction_bucket[cache[i].symbol];
    }
}

/*
 * Block-parallel step 2 (sort) for the two-entries-per-symbol layout:
 * identical to the 6k version except that the bucket pointer of symbol c is
 * induction_bucket[BUCKETS_INDEX2(c, 0)].
 */
static void libsais_radix_sort_lms_suffixes_32s_2k_block_sort(sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
    {
        libsais_prefetchw(&cache[i - 2 * prefetch_distance]);

        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 0].symbol, 0)]);
        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 1].symbol, 0)]);
        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i -
prefetch_distance - 2].symbol, 0)]);
        libsais_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 3].symbol, 0)]);

        cache[i - 0].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 0].symbol, 0)];
        cache[i - 1].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 1].symbol, 0)];
        cache[i - 2].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 2].symbol, 0)];
        cache[i - 3].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 3].symbol, 0)];
    }

    /* Remaining entries, one at a time. */
    for (j -= prefetch_distance + 3; i >= j; i -= 1)
    {
        cache[i].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i].symbol, 0)];
    }
}

/*
 * Radix-sorts one block SA[block_start .. block_start + block_size - 1]
 * (one bucket pointer per symbol) using up to `threads` threads.
 * A team is only started when threads > 1 and block_size >= 16384; with one
 * thread the serial routine handles the whole block. Otherwise the work is
 * split into gather (parallel), sort (master only) and place (parallel),
 * separated by barriers.
 */
static void libsais_radix_sort_lms_suffixes_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Per-thread sub-ranges are multiples of 16; the last thread takes the remainder. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : block_size - omp_block_start;

        /* Convert the block-relative offset into an absolute SA index. */
        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            libsais_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /* `cache - block_start` rebases the cache so that it can be indexed
               with absolute SA indices while storage starts at cache[0]. */
            {
                libsais_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* Bucket pointers are shared, so one thread assigns all destinations. */
            #pragma omp master
            {
                libsais_radix_sort_lms_suffixes_32s_6k_block_sort(induction_bucket, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            {
                libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }
}

/*
 * Radix-sorts one block SA[block_start .. block_start + block_size - 1]
 * for the two-entries-per-symbol bucket layout. Same gather / master-only
 * sort / place structure as the 6k block routine; a team is only started
 * when threads > 1 and block_size >= 16384.
 */
static void libsais_radix_sort_lms_suffixes_32s_2k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Per-thread sub-ranges are multiples of 16; the last thread takes the remainder. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { libsais_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { libsais_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); } #pragma omp barrier #pragma omp master { libsais_radix_sort_lms_suffixes_32s_2k_block_sort(induction_bucket, cache - block_start, block_start, block_size); } #pragma omp barrier { libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); } } #endif } } #endif static void libsais_radix_sort_lms_suffixes_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { if (threads == 1 || m < 65536) { libsais_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t block_start, block_end; for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end) { block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end >= m) { block_end = (fast_sint_t)m - 1; } libsais_radix_sort_lms_suffixes_32s_6k_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end, block_end - block_start, threads); } } #else UNUSED(thread_state); #endif } static void libsais_radix_sort_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { if (threads == 1 || m < 65536) { libsais_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t 
block_start, block_end;
        for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end)
        {
            /* Clamp the final block so that exactly m - 1 entries are covered. */
            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end >= m) { block_end = (fast_sint_t)m - 1; }

            /* Blocks are counted from the end of SA: this one starts at n - block_end. */
            libsais_radix_sort_lms_suffixes_32s_2k_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end, block_end - block_start, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif
}

/*
 * Scans T from position n - 2 down to 0 and writes selected positions into SA.
 *
 * For every position a one-bit flag is maintained (f0 / f1, alternating):
 * the flag of position i is 1 when T[i] > T[i + 1], or when T[i] == T[i + 1]
 * and the flag of position i + 1 is 1. The flag of position n - 1 starts at 1.
 *
 * Whenever position i has flag 1 and position i + 1 has flag 0, position
 * i + 1 is stored at SA[--buckets[T[i + 1]]] and counted. c2 remembers the
 * symbol of the most recently stored position.
 *
 * After the scan, if more than one position was stored, the SA slot of the
 * most recently stored one (SA[buckets[c2]]) is overwritten with 0.
 *
 * Returns the number of positions stored.
 */
static sa_sint_t libsais_radix_sort_lms_suffixes_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t i = n - 2;
    sa_sint_t m = 0;
    fast_uint_t f0 = 1;
    fast_uint_t f1 = 0;
    fast_sint_t c0 = T[n - 1];
    fast_sint_t c1 = 0;
    fast_sint_t c2 = 0;

    /* Main loop: four positions per iteration, with c0/c1 and f0/f1 swapping roles at each step. */
    for (; i >= prefetch_distance + 3; i -= 4)
    {
        libsais_prefetchr(&T[i - 2 * prefetch_distance]);

        libsais_prefetchw(&buckets[T[i - prefetch_distance - 0]]);
        libsais_prefetchw(&buckets[T[i - prefetch_distance - 1]]);
        libsais_prefetchw(&buckets[T[i - prefetch_distance - 2]]);
        libsais_prefetchw(&buckets[T[i - prefetch_distance - 3]]);

        c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); if (f1 & ~f0) { SA[--buckets[c2 = c0]] = i + 1; m++; }
        c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); if (f0 & ~f1) { SA[--buckets[c2 = c1]] = i - 0; m++; }
        c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); if (f1 & ~f0) { SA[--buckets[c2 = c0]] = i - 1; m++; }
        c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); if (f0 & ~f1) { SA[--buckets[c2 = c1]] = i - 2; m++; }
    }

    /* Remaining positions, one per iteration; here c0/f0 always describe position i. */
    for (; i >= 0; i -= 1)
    {
        c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); if (f0 & ~f1) { SA[--buckets[c2 = c1]] = i + 1; m++; }
    }

    if (m > 1)
    {
        SA[buckets[c2]] = 0;
    }

    return m;
}

/*
 * For every i in [omp_block_start, omp_block_start + omp_block_size), ORs
 * SAINT_MIN into the SA entry whose index is stored in induction_bucket[i].
 */
static void libsais_radix_sort_set_markers_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const
fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;

    /* Main loop: four entries per iteration, prefetching the entries needed prefetch_distance iterations ahead. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
    {
        libsais_prefetchr(&induction_bucket[i + 2 * prefetch_distance]);

        libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 0]]);
        libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 1]]);
        libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 2]]);
        libsais_prefetchw(&SA[induction_bucket[i + prefetch_distance + 3]]);

        SA[induction_bucket[i + 0]] |= SAINT_MIN;
        SA[induction_bucket[i + 1]] |= SAINT_MIN;
        SA[induction_bucket[i + 2]] |= SAINT_MIN;
        SA[induction_bucket[i + 3]] |= SAINT_MIN;
    }

    /* Remaining entries up to the end of the range, without prefetching. */
    for (j += prefetch_distance + 3; i < j; i += 1)
    {
        SA[induction_bucket[i]] |= SAINT_MIN;
    }
}

/*
 * For every i in [omp_block_start, omp_block_start + omp_block_size), ORs
 * SUFFIX_GROUP_MARKER into the SA entry whose index is stored in
 * induction_bucket[BUCKETS_INDEX2(i, 0)].
 */
static void libsais_radix_sort_set_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;

    /* Main loop: four entries per iteration, prefetching the entries needed prefetch_distance iterations ahead. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
    {
        libsais_prefetchr(&induction_bucket[BUCKETS_INDEX2(i + 2 * prefetch_distance, 0)]);

        libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 0, 0)]]);
        libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 1, 0)]]);
        libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 2, 0)]]);
        libsais_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 3, 0)]]);

        SA[induction_bucket[BUCKETS_INDEX2(i + 0, 0)]] |= SUFFIX_GROUP_MARKER;
        SA[induction_bucket[BUCKETS_INDEX2(i + 1, 0)]] |= SUFFIX_GROUP_MARKER;
        SA[induction_bucket[BUCKETS_INDEX2(i + 2, 0)]] |= SUFFIX_GROUP_MARKER;
        SA[induction_bucket[BUCKETS_INDEX2(i + 3, 0)]] |= SUFFIX_GROUP_MARKER;
    }

    /* Remaining entries up to the end of the range, without prefetching. */
    for (j += prefetch_distance + 3; i < j; i += 1)
    {
        SA[induction_bucket[BUCKETS_INDEX2(i, 0)]] |= SUFFIX_GROUP_MARKER;
    }
}

/*
 * Runs libsais_radix_sort_set_markers_32s_6k over the index range [0, k - 1).
 *
 * In OpenMP builds the range is split between threads (a stride of
 * (k - 1) / omp_num_threads rounded down to a multiple of 16, with the last
 * thread taking the remainder); the parallel region is only activated when
 * threads > 1 and k >= 65536. Without OpenMP the whole range is processed
 * in one call.
 */
static void libsais_radix_sort_set_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && k >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
        fast_sint_t omp_block_stride = (((fast_sint_t)k - 1) / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : (fast_sint_t)k - 1 - omp_block_start;
#else
        UNUSED(threads);

        fast_sint_t omp_block_start = 0;
        fast_sint_t omp_block_size = (fast_sint_t)k - 1;
#endif

        libsais_radix_sort_set_markers_32s_6k(SA, induction_bucket, omp_block_start, omp_block_size);
    }
}

/*
 * Runs libsais_radix_sort_set_markers_32s_4k over the index range [0, k - 1).
 *
 * In OpenMP builds the range is split between threads (a stride of
 * (k - 1) / omp_num_threads rounded down to a multiple of 16, with the last
 * thread taking the remainder); the parallel region is only activated when
 * threads > 1 and k >= 65536. Without OpenMP the whole range is processed
 * in one call.
 */
static void libsais_radix_sort_set_markers_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && k >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
        fast_sint_t omp_block_stride = (((fast_sint_t)k - 1) / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : (fast_sint_t)k - 1 - omp_block_start; #else UNUSED(threads); fast_sint_t omp_block_start = 0; fast_sint_t omp_block_size = (fast_sint_t)k - 1; #endif libsais_radix_sort_set_markers_32s_4k(SA, induction_bucket, omp_block_start, omp_block_size); } } static void libsais_initialize_buckets_for_partial_sorting_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count) { sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE]; buckets[BUCKETS_INDEX4((fast_uint_t)T[first_lms_suffix], 1)]++; fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0; for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) { temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0; sum0 += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 2)]; sum1 += buckets[i + BUCKETS_INDEX4(0, 1)]; buckets[j + BUCKETS_INDEX2(0, 0)] = sum0; buckets[j + BUCKETS_INDEX2(0, 1)] = sum1; } } static void libsais_initialize_buckets_for_partial_sorting_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count) { sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k]; fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0, sum2 = 0; for (first_lms_suffix = T[first_lms_suffix], i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i != BUCKETS_INDEX4((fast_sint_t)first_lms_suffix, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) { sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)]; sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)]; sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)]; sa_sint_t LL = buckets[i + BUCKETS_INDEX4(0, 3)]; buckets[i + BUCKETS_INDEX4(0, 0)] = sum0; buckets[i + BUCKETS_INDEX4(0, 1)] = sum2; buckets[i + BUCKETS_INDEX4(0, 2)] = 0; buckets[i + BUCKETS_INDEX4(0, 3)] = 0; sum0 += SS + SL; sum1 += 
LS; sum2 += LS + LL;

        temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0;
        temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1;
    }

    /* Second part: sum1 is bumped by one, then the same update continues up to symbol k - 1. */
    for (sum1 += 1; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0))
    {
        sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)];
        sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)];
        sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)];
        sa_sint_t LL = buckets[i + BUCKETS_INDEX4(0, 3)];

        buckets[i + BUCKETS_INDEX4(0, 0)] = sum0;
        buckets[i + BUCKETS_INDEX4(0, 1)] = sum2;
        buckets[i + BUCKETS_INDEX4(0, 2)] = 0;
        buckets[i + BUCKETS_INDEX4(0, 3)] = 0;

        sum0 += SS + SL; sum1 += LS; sum2 += LS + LL;

        temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0;
        temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1;
    }
}

/*
 * Left-to-right pass over SA[omp_block_start, omp_block_start + omp_block_size).
 *
 * For every entry p read from SA:
 *   - the counter d is incremented when the entry is negative (sign bit set),
 *     and the sign bit is then masked off with SAINT_MAX;
 *   - v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]) selects a slot;
 *   - p - 1 is written to SA at the position held in induction_bucket[v]
 *     (which is then advanced), with the top bit (SAINT_BIT - 1) set when
 *     distinct_names[v] differs from the current d;
 *   - distinct_names[v] is set to d.
 *
 * induction_bucket and distinct_names live inside `buckets` at offsets
 * 4 * ALPHABET_SIZE and 2 * ALPHABET_SIZE respectively.
 *
 * Entries are not checked for zero here, so T[p - 1] and T[p - 2] are read
 * for every entry in the range.
 *
 * Returns the updated counter d.
 */
static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

    fast_sint_t i, j;

    /* Main loop: two entries per iteration, prefetching SA and the text around the entries prefetch_distance ahead. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais_prefetchr(&SA[i + 2 * prefetch_distance]);

        libsais_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
        libsais_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
        libsais_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
        libsais_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);

        sa_sint_t p0 = SA[i + 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]);
        SA[induction_bucket[v0]++] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d;

        sa_sint_t p1 = SA[i + 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]);
        SA[induction_bucket[v1]++] = (p1
- 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
    }

    /* Remaining entries up to the end of the range, one per iteration and without prefetching. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
        SA[induction_bucket[v]++] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
    }

    return d;
}

#if defined(LIBSAIS_OPENMP)

/*
 * First phase of the multi-threaded 8u left-to-right scan, run by each thread
 * on its own slice SA[omp_block_start, omp_block_start + omp_block_size).
 *
 * `buckets` is used as two tables of 2 * k entries each (induction_bucket at
 * offset 0, distinct_names at offset 2 * ALPHABET_SIZE); both are zeroed
 * first. The slice is then read without modifying SA:
 *   - each raw SA entry (sign bit included) is stored in cache[count].index
 *     and its slot v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]) in
 *     cache[count].symbol, where p is the entry with the sign bit masked off;
 *   - induction_bucket[v] counts how many entries map to slot v;
 *   - a local counter d starts at 1 and is incremented for every negative
 *     entry; distinct_names[v] records the value of d at the last entry that
 *     mapped to slot v.
 *
 * On return state[0].state.position holds d - 1 (the number of negative
 * entries seen) and state[0].state.count the number of cached entries.
 */
static void libsais_partial_sorting_scan_left_to_right_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size, LIBSAIS_THREAD_STATE * RESTRICT state)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

    memset(induction_bucket, 0, (size_t)2 * (size_t)k * sizeof(sa_sint_t));
    memset(distinct_names , 0, (size_t)2 * (size_t)k * sizeof(sa_sint_t));

    fast_sint_t i, j, count = 0; sa_sint_t d = 1;

    /* Main loop: two entries per iteration, prefetching SA and the text around the entries prefetch_distance ahead. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais_prefetchr(&SA[i + 2 * prefetch_distance]);

        libsais_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
        libsais_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
        libsais_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
        libsais_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);

        sa_sint_t p0 = cache[count].index = SA[i + 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); induction_bucket[v0]++; distinct_names[v0] = d;
        sa_sint_t p1 = cache[count].index = SA[i + 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); induction_bucket[v1]++; distinct_names[v1] = d;
    }

    for (j +=
prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = cache[count].index = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]); induction_bucket[v]++; distinct_names[v] = d;
    }

    /* d started at 1, so d - 1 is the number of negative entries seen in this slice. */
    state[0].state.position = (fast_sint_t)d - 1;
    state[0].state.count = count;
}

/*
 * Final phase of the multi-threaded 8u left-to-right scan: replays the
 * `count` entries recorded in `cache` and writes the results into SA.
 *
 * `buckets` is interpreted as two tables: induction_bucket at offset 0 (the
 * SA position at which the next entry of each slot is written) and
 * distinct_names at offset 2 * ALPHABET_SIZE. For each cached entry the
 * counter d is incremented when the cached index is negative, and
 * (index - 1) is written to SA with the top bit (SAINT_BIT - 1) set when
 * distinct_names[symbol] differs from d; distinct_names[symbol] is then set
 * to d. The cached index is not masked before the subtraction, so a set sign
 * bit is carried into the value that is written.
 */
static void libsais_partial_sorting_scan_left_to_right_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

    fast_sint_t i, j;

    /* Main loop: two cached entries per iteration. */
    for (i = 0, j = count - 1; i < j; i += 2)
    {
        libsais_prefetchr(&cache[i + prefetch_distance]);

        sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol;
        SA[induction_bucket[v0]++] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d;

        sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol;
        SA[induction_bucket[v1]++] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
    }

    /* At most one entry is left when count is odd. */
    for (j += 1; i < j; i += 1)
    {
        sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol;
        SA[induction_bucket[v]++] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
    }
}

/*
 * Runs the 8u left-to-right scan over SA[block_start, block_start + block_size)
 * and returns the updated counter d.
 *
 * The block is divided between the threads (a stride of
 * block_size / omp_num_threads rounded down to a multiple of 16, with the
 * last thread taking the remainder). With one thread the serial scan is
 * called directly. With several threads:
 *   1. every thread calls block_prepare on its slice, filling its own
 *      thread_state buckets, cache, position and count;
 *   2. after a barrier the master thread walks the threads in order and
 *      merges their tables into the shared `buckets` (see comments below);
 *   3. after a second barrier every thread calls block_place with its own
 *      tables, cache and starting value of d.
 *
 * The parallel region is only activated when threads > 1,
 * block_size >= 64 * max(k, 256) and omp_get_dynamic() returns 0.
 */
static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ? k : 256) && omp_get_dynamic() == 0)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(k); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            d = libsais_partial_sorting_scan_left_to_right_8u(T, SA, buckets, d, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                libsais_partial_sorting_scan_left_to_right_8u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]);
            }

            #pragma omp barrier

            #pragma omp master
            {
                sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
                sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

                fast_sint_t t;
                for (t = 0; t < omp_num_threads; ++t)
                {
                    sa_sint_t * RESTRICT temp_induction_bucket = &thread_state[t].state.buckets[0 * ALPHABET_SIZE];
                    sa_sint_t * RESTRICT temp_distinct_names = &thread_state[t].state.buckets[2 * ALPHABET_SIZE];

                    /* The shared write positions advance by this thread's counts; the thread's table receives the positions it starts writing at. */
                    fast_sint_t c;
                    for (c = 0; c < 2 * k; c += 1)
                    {
                        sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A + B; temp_induction_bucket[c] = A;
                    }

                    /* Thread-local names are offset by d - 1 to become global; slots the thread never touched (B == 0) keep the shared value. */
                    for (d -= 1, c = 0; c < 2 * k; c += 1)
                    {
                        sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ? D : A; temp_distinct_names[c] = A;
                    }

                    /* d advances by the thread's count of negative entries; position becomes the value of d the thread starts with. */
                    d += 1 + (sa_sint_t)thread_state[t].state.position;
                    thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position;
                }
            }

            #pragma omp barrier

            {
                libsais_partial_sorting_scan_left_to_right_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position);
            }
        }
#endif
    }

    return d;
}

#endif

/*
 * Entry point of the 8u left-to-right partial sorting scan over the first
 * left_suffixes_count entries of SA. Returns the updated counter d.
 *
 * Before the scan, position n - 1 is written to SA with SAINT_MIN set, at the
 * position given by the induction bucket of
 * BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1]); d is incremented and
 * recorded as that slot's name.
 *
 * When threads == 1 or left_suffixes_count < 65536 the serial scan handles
 * the whole range. Otherwise (OpenMP builds only) the range is walked in
 * runs of non-zero SA entries: zero entries are skipped, runs shorter than
 * 32 entries are processed inline, and longer runs (capped at
 * threads * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads) entries) are
 * handed to the block_omp routine.
 */
static sa_sint_t libsais_partial_sorting_scan_left_to_right_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

    SA[induction_bucket[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN;
    distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])] = ++d;

    if (threads == 1 || left_suffixes_count < 65536)
    {
        d = libsais_partial_sorting_scan_left_to_right_8u(T, SA, buckets, d, 0, left_suffixes_count);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start;
        for (block_start = 0; block_start < left_suffixes_count; )
        {
            if (SA[block_start] == 0)
            {
                block_start++;
            }
            else
            {
                fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > left_suffixes_count) { block_max_end = left_suffixes_count;}

                /* Extend the block up to the next zero entry or the size cap, whichever comes first. */
                fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; }
                fast_sint_t block_size = block_end - block_start;

                if (block_size < 32)
                {
                    for (; block_start < block_end; block_start += 1)
                    {
                        sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
                        SA[induction_bucket[v]++] = (p - 1) |
(sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
                    }
                }
                else
                {
                    d = libsais_partial_sorting_scan_left_to_right_8u_block_omp(T, SA, k, buckets, d, block_start, block_size, threads, thread_state);
                    block_start = block_end;
                }
            }
        }
    }
#else
    UNUSED(k); UNUSED(thread_state);
#endif

    return d;
}

/*
 * Left-to-right pass over SA[omp_block_start, omp_block_start + omp_block_size)
 * for 32-bit symbols, using the BUCKETS_INDEX4 layout of `buckets`.
 *
 * For every entry p read from SA:
 *   - the counter d is incremented when the entry is negative, and the sign
 *     bit is then masked off with SAINT_MAX;
 *   - v = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]) selects a slot;
 *   - p - 1 is written to SA at the position held in buckets[v] (which is
 *     then advanced), with the top bit (SAINT_BIT - 1) set when
 *     buckets[2 + v] differs from the current d;
 *   - buckets[2 + v] is set to d.
 *
 * Returns the updated counter d.
 */
static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;

    /* Main loop: two entries per iteration with a two-stage prefetch (text at 2 * prefetch_distance ahead, bucket slot at prefetch_distance ahead). */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2)
    {
        libsais_prefetchr(&SA[i + 3 * prefetch_distance]);

        libsais_prefetchr(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 1);
        libsais_prefetchr(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 2);
        libsais_prefetchr(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 1);
        libsais_prefetchr(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 2);

        /* "p - (p > 0)" reads T[p - 1] for positive p and T[0] for p == 0, so the prefetch lookup never indexes below T[0]. */
        sa_sint_t p0 = SA[i + prefetch_distance + 0] & SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); libsais_prefetchw(&buckets[v0]);
        sa_sint_t p1 = SA[i + prefetch_distance + 1] & SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0); libsais_prefetchw(&buckets[v1]);

        sa_sint_t p2 = SA[i + 0]; d += (p2 < 0); p2 &= SAINT_MAX; sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] >= T[p2 - 1]);
        SA[buckets[v2]++] = (p2 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1)); buckets[2 + v2] = d;

        sa_sint_t p3 = SA[i + 1]; d += (p3 < 0); p3 &= SAINT_MAX; sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] >= T[p3 - 1]);
        SA[buckets[v3]++] = (p3 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1)); buckets[2 + v3] = d;
    }

    /* Remaining entries up to the end of the range, one per iteration and without prefetching. */
    for (j += 2 * prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p
- 1]);
        SA[buckets[v]++] = (p - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
    }

    return d;
}

/*
 * Left-to-right pass over SA[omp_block_start, omp_block_start + omp_block_size)
 * for 32-bit symbols, with `buckets` split into distinct_names (offset 0,
 * BUCKETS_INDEX2 layout) and induction_bucket (offset 2 * k, one entry per
 * symbol).
 *
 * Every entry p read from SA is written back with its sign bit cleared.
 * Entries that are positive are additionally consumed:
 *   - the SA slot is reset to 0;
 *   - d is increased by p >> (SUFFIX_GROUP_BIT - 1), after which
 *     SUFFIX_GROUP_MARKER is cleared from p
 *     (NOTE(review): this presumably adds exactly the marker bit; confirm
 *     against the definitions of SUFFIX_GROUP_BIT / SUFFIX_GROUP_MARKER);
 *   - p - 1 is written to SA at the position held in
 *     induction_bucket[T[p - 1]] (which is then advanced), with the top bit
 *     (SAINT_BIT - 1) set when T[p - 2] < T[p - 1] and bit
 *     SUFFIX_GROUP_BIT - 1 set when
 *     distinct_names[BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1])] differs
 *     from d; that distinct_names entry is then set to d.
 *
 * Returns the updated counter d.
 */
static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[2 * (fast_sint_t)k];
    sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k];

    fast_sint_t i, j;

    /* Main loop: two entries per iteration with a two-stage prefetch (text at 2 * prefetch_distance ahead, bucket tables at prefetch_distance ahead). */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2)
    {
        libsais_prefetchw(&SA[i + 3 * prefetch_distance]);

        /* Non-positive entries fall back to index 2 so that the "- 1" / "- 2" prefetch addresses stay inside T. */
        sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 & ~SUFFIX_GROUP_MARKER : 2]; libsais_prefetchr(Ts0 - 1); libsais_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ? s1 & ~SUFFIX_GROUP_MARKER : 2]; libsais_prefetchr(Ts1 - 1); libsais_prefetchr(Ts1 - 2);
        sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts2]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); }
        sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts3]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); }

        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX;
        if (p0 > 0)
        {
            SA[i + 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]);
            SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d;
        }

        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1
& SAINT_MAX;
        if (p1 > 0)
        {
            SA[i + 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); p1 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]);
            SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
        }
    }

    /* Remaining entries up to the end of the range, one per iteration and without prefetching. */
    for (j += 2 * prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX;
        if (p > 0)
        {
            SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]);
            SA[induction_bucket[T[p - 1]]++] = (p - 1) | (sa_sint_t)((sa_uint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
        }
    }

    return d;
}

/*
 * Left-to-right pass over SA[omp_block_start, omp_block_start + omp_block_size)
 * for 32-bit symbols with a single induction_bucket entry per symbol.
 *
 * Every entry p read from SA is written back with its sign bit cleared.
 * Entries that are positive are additionally consumed: the SA slot is reset
 * to 0 and p - 1 is written to SA at the position held in
 * induction_bucket[T[p - 1]] (which is then advanced), with the top bit
 * (SAINT_BIT - 1) set when T[p - 2] < T[p - 1].
 */
static void libsais_partial_sorting_scan_left_to_right_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;

    /* Main loop: two entries per iteration with a two-stage prefetch (text at 2 * prefetch_distance ahead, bucket at prefetch_distance ahead). */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2)
    {
        libsais_prefetchw(&SA[i + 3 * prefetch_distance]);

        /* Non-positive entries fall back to index 1 so that the "- 1" prefetch address stays inside T. */
        sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 1]; libsais_prefetchr(Ts0 - 1);
        sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 : 1]; libsais_prefetchr(Ts1 - 1);
        sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetchr(&T[s2] - 2); }
        sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetchr(&T[s3] - 2); }

        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { SA[i + 0] = 0; SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)); }
        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { SA[i + 1] = 0; SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)); }
    }

    /* Remaining entries up to the end of the range, one per iteration and without prefetching. */
    for (j += 2 * prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { SA[i] = 0; SA[induction_bucket[T[p - 1]]++] = (p - 1) | (sa_sint_t)((sa_uint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)); }
    }
}

#if defined(LIBSAIS_OPENMP)

/*
 * Copies SA[omp_block_start, omp_block_start + omp_block_size) into `cache`,
 * which is indexed with the same positions as SA.
 *
 * For each position i, cache[i].index receives the raw SA entry (sign bit
 * included) and cache[i].symbol receives
 * BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]), where p is the entry with
 * the sign bit masked off, or 0 when p is 0. SA itself is not modified.
 */
static void libsais_partial_sorting_scan_left_to_right_32s_6k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;

    /* Main loop: two entries per iteration, prefetching SA, the text and the cache slots prefetch_distance ahead. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais_prefetchr(&SA[i + 2 * prefetch_distance]);

        libsais_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
        libsais_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
        libsais_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
        libsais_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);

        libsais_prefetchw(&cache[i + prefetch_distance]);

        sa_sint_t p0 = cache[i + 0].index = SA[i + 0]; sa_sint_t symbol0 = 0; p0 &= SAINT_MAX; if (p0 != 0) { symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); } cache[i + 0].symbol = symbol0;
        sa_sint_t p1 = cache[i +
1].index = SA[i + 1]; sa_sint_t symbol1 = 0; p1 &= SAINT_MAX; if (p1 != 0) { symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); } cache[i + 1].symbol = symbol1;
    }

    /* Remaining entries up to the end of the range, one per iteration and without prefetching. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = cache[i].index = SA[i]; sa_sint_t symbol = 0; p &= SAINT_MAX; if (p != 0) { symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]); } cache[i].symbol = symbol;
    }
}

/*
 * Moves the positive entries of SA[omp_block_start, omp_block_start +
 * omp_block_size) into `cache`, which is indexed with the same positions
 * as SA.
 *
 * For a positive entry p: cache[i].index receives p unchanged,
 * cache[i].symbol receives BUCKETS_INDEX2(T[q - 1], T[q - 2] < T[q - 1])
 * where q is p with SUFFIX_GROUP_MARKER cleared, and SA[i] is reset to 0.
 * For any other entry cache[i].symbol is set to SAINT_MIN, cache[i].index is
 * left untouched, and SA[i] is rewritten with its sign bit cleared.
 */
static void libsais_partial_sorting_scan_left_to_right_32s_4k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;

    /* Main loop: two entries per iteration, prefetching SA, the text and the cache slots prefetch_distance ahead. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);

        /* Non-positive entries fall back to index 2 so that the "- 1" / "- 2" prefetch addresses stay inside T. */
        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 & ~SUFFIX_GROUP_MARKER : 2]; libsais_prefetchr(Ts0 - 1); libsais_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 & ~SUFFIX_GROUP_MARKER : 2]; libsais_prefetchr(Ts1 - 1); libsais_prefetchr(Ts1 - 2);

        libsais_prefetchw(&cache[i + prefetch_distance]);

        /* After a positive entry has been cached, p is zeroed so that the final store resets the SA slot to 0. */
        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { cache[i + 0].index = p0; p0 &= ~SUFFIX_GROUP_MARKER; symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]); p0 = 0; } cache[i + 0].symbol = symbol0; SA[i + 0] = p0 & SAINT_MAX;
        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { cache[i + 1].index = p1; p1 &= ~SUFFIX_GROUP_MARKER; symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]); p1 = 0; } cache[i + 1].symbol = symbol1; SA[i + 1] = p1 & SAINT_MAX;
    }

    /* Remaining entries up to the end of the range, one per iteration and without prefetching. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { cache[i].index = p; p &= ~SUFFIX_GROUP_MARKER; symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]); p = 0; } cache[i].symbol = symbol; SA[i] = p & SAINT_MAX;
    }
}

/*
 * Moves the positive entries of SA[omp_block_start, omp_block_start +
 * omp_block_size) into `cache`, which is indexed with the same positions
 * as SA.
 *
 * For a positive entry p: cache[i].index receives p - 1 with the top bit
 * (SAINT_BIT - 1) set when T[p - 2] < T[p - 1], cache[i].symbol receives
 * T[p - 1], and SA[i] is reset to 0. For any other entry cache[i].symbol is
 * set to SAINT_MIN, cache[i].index is left untouched, and SA[i] is rewritten
 * with its sign bit cleared.
 */
static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;

    /* Main loop: two entries per iteration, prefetching SA, the text and the cache slots prefetch_distance ahead. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);

        /* Non-positive entries fall back to index 2 so that the "- 1" / "- 2" prefetch addresses stay inside T. */
        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais_prefetchr(Ts0 - 1); libsais_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 : 2]; libsais_prefetchr(Ts1 - 1); libsais_prefetchr(Ts1 - 2);

        libsais_prefetchw(&cache[i + prefetch_distance]);

        /* After a positive entry has been cached, p is zeroed so that the final store resets the SA slot to 0. */
        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { cache[i + 0].index = (p0 - 1) | (sa_sint_t)((sa_uint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)); symbol0 = T[p0 - 1]; p0 = 0; } cache[i + 0].symbol = symbol0; SA[i + 0] = p0 & SAINT_MAX;
        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { cache[i + 1].index = (p1 - 1) | (sa_sint_t)((sa_uint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)); symbol1 = T[p1 - 1]; p1 = 0; } cache[i + 1].symbol = symbol1; SA[i + 1] = p1 & SAINT_MAX;
    }

    /* Remaining entries up to the end of the range, one per iteration and without prefetching. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { cache[i].index = (p - 1) | (sa_sint_t)((sa_uint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)); symbol = T[p - 1]; p = 0; } cache[i].symbol = symbol; SA[i] = p & SAINT_MAX;
    }
}

/*
 * Processes the cached entries for positions [omp_block_start,
 * omp_block_start + omp_block_size) in increasing order and returns the
 * updated counter d.
 *
 * For each cached entry (slot v in .symbol, raw SA value p in .index):
 *   - d is incremented when p is negative;
 *   - .symbol is replaced by the destination position taken from buckets[v]
 *     (which is then advanced);
 *   - .index is replaced by p - 1 with the top bit (SAINT_BIT - 1) set when
 *     buckets[2 + v] differs from d; buckets[2 + v] is then set to d;
 *   - when the destination position is below the end of the block, the new
 *     value is also copied into the cache entry at that position, together
 *     with its slot BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]) (q being
 *     the new value with the sign bit masked off), so that it is picked up
 *     when the loop reaches that position.
 */
static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;

    /* Main loop: two entries per iteration, prefetching the cache and the bucket slots needed prefetch_distance ahead. */
    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
    {
        libsais_prefetchw(&cache[i + 2 * prefetch_distance]);

        libsais_prefetchw(&buckets[cache[i + prefetch_distance + 0].symbol]);
        libsais_prefetchw(&buckets[cache[i + prefetch_distance + 1].symbol]);

        sa_sint_t v0 = cache[i + 0].symbol, p0 = cache[i + 0].index; d += (p0 < 0); cache[i + 0].symbol = buckets[v0]++; cache[i + 0].index = (p0 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1)); buckets[2 + v0] = d;
        if (cache[i + 0].symbol < omp_block_end) { sa_sint_t s = cache[i + 0].symbol, q = (cache[s].index = cache[i + 0].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); }

        sa_sint_t v1 = cache[i +
1].symbol, p1 = cache[i + 1].index; d += (p1 < 0); cache[i + 1].symbol = buckets[v1]++; cache[i + 1].index = (p1 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1)); buckets[2 + v1] = d;
        if (cache[i + 1].symbol < omp_block_end) { sa_sint_t s = cache[i + 1].symbol, q = (cache[s].index = cache[i + 1].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); }
    }

    /* Remaining entries up to the end of the block, one per iteration and without prefetching. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t v = cache[i].symbol, p = cache[i].index; d += (p < 0); cache[i].symbol = buckets[v]++; cache[i].index = (p - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
        if (cache[i].symbol < omp_block_end) { sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); }
    }

    return d;
}

/*
 * Processes the cached entries for positions [omp_block_start,
 * omp_block_start + omp_block_size) in increasing order and returns the
 * updated counter d. `buckets` is split into distinct_names (offset 0) and
 * induction_bucket (offset 2 * k).
 *
 * Entries whose .symbol is negative are skipped. For every other entry
 * (slot v in .symbol, SA value p in .index):
 *   - d is increased by p >> (SUFFIX_GROUP_BIT - 1);
 *   - .symbol is replaced by the destination position taken from
 *     induction_bucket[v >> 1] (which is then advanced);
 *   - .index is replaced by p - 1 combined with v shifted into the top bit
 *     (SAINT_BIT - 1) and with bit SUFFIX_GROUP_BIT - 1 set when
 *     distinct_names[v] differs from d; distinct_names[v] is then set to d;
 *   - when the destination position is below the end of the block, the new
 *     value is forwarded into the cache entry at that position (see the
 *     loop bodies).
 */
static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[2 * (fast_sint_t)k];
    sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k];

    fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;

    /* Main loop: two entries per iteration; skipped (non-positive) symbols prefetch slot 0 instead. */
    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
    {
        libsais_prefetchw(&cache[i + 2 * prefetch_distance]);

        sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 > 0 ? s0 >> 1 : 0]; libsais_prefetchw(Is0); const sa_sint_t * Ds0 = &distinct_names[s0 > 0 ? s0 : 0]; libsais_prefetchw(Ds0);
        sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 > 0 ? s1 >> 1 : 0]; libsais_prefetchw(Is1); const sa_sint_t * Ds1 = &distinct_names[s1 > 0 ?
s1 : 0]; libsais_prefetchw(Ds1); sa_sint_t v0 = cache[i + 0].symbol; if (v0 >= 0) { sa_sint_t p0 = cache[i + 0].index; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); cache[i + 0].symbol = induction_bucket[v0 >> 1]++; cache[i + 0].index = (p0 - 1) | (sa_sint_t)((sa_uint_t)v0 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d; if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i + 0].index = np & SAINT_MAX; } } sa_sint_t v1 = cache[i + 1].symbol; if (v1 >= 0) { sa_sint_t p1 = cache[i + 1].index; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); cache[i + 1].symbol = induction_bucket[v1 >> 1]++; cache[i + 1].index = (p1 - 1) | (sa_sint_t)((sa_uint_t)v1 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d; if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i + 1].index = np & SAINT_MAX; } } } for (j += prefetch_distance + 1; i < j; i += 1) { sa_sint_t v = cache[i].symbol; if (v >= 0) { sa_sint_t p = cache[i].index; d += (p >> (SUFFIX_GROUP_BIT - 1)); cache[i].symbol = induction_bucket[v >> 1]++; cache[i].index = (p - 1) | (sa_sint_t)((sa_uint_t)v << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d; if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i].index = np & SAINT_MAX; } } } return d; } static void 
/*
 * Sequential "sort" step of the cached left-to-right scan (1k bucket layout).
 *
 * Visits cache[omp_block_start .. omp_block_start + omp_block_size) in
 * increasing order. Entries with a negative symbol are skipped. For any other
 * entry the next free slot of induction_bucket[symbol] (post-incremented)
 * replaces the symbol. When that slot lies before the end of the block:
 *   - a positive index is induced into cache[slot] (index - 1 with the top bit
 *     recording T[index - 2] < T[index - 1], symbol T[index - 1]) and this
 *     entry's index is zeroed;
 *   - a non-positive index merely has its top bit cleared.
 */
libsais_partial_sorting_scan_left_to_right_32s_1k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;
    const fast_sint_t block_end = omp_block_start + omp_block_size;
    const fast_sint_t unrolled_end = block_end - prefetch_distance - 1;

    fast_sint_t pos = omp_block_start;

    /* Two entries per iteration while the look-ahead reads stay inside the block. */
    while (pos < unrolled_end)
    {
        libsais_prefetchw(&cache[pos + 2 * prefetch_distance]);

        /* Skipped (non-positive) symbols are clamped to bucket 0 for the prefetch address. */
        const sa_sint_t ahead0 = cache[pos + prefetch_distance + 0].symbol;
        libsais_prefetchw(&induction_bucket[ahead0 > 0 ? ahead0 : 0]);

        const sa_sint_t ahead1 = cache[pos + prefetch_distance + 1].symbol;
        libsais_prefetchw(&induction_bucket[ahead1 > 0 ? ahead1 : 0]);

        const sa_sint_t bucket0 = cache[pos + 0].symbol;
        if (bucket0 >= 0)
        {
            const sa_sint_t slot0 = induction_bucket[bucket0]++;
            cache[pos + 0].symbol = slot0;

            if (slot0 < block_end)
            {
                sa_sint_t suffix0 = cache[pos + 0].index;
                if (suffix0 > 0)
                {
                    cache[slot0].index = (suffix0 - 1) | (sa_sint_t)((sa_uint_t)(T[suffix0 - 2] < T[suffix0 - 1]) << (SAINT_BIT - 1));
                    cache[slot0].symbol = T[suffix0 - 1];
                    suffix0 = 0;
                }
                cache[pos + 0].index = suffix0 & SAINT_MAX;
            }
        }

        const sa_sint_t bucket1 = cache[pos + 1].symbol;
        if (bucket1 >= 0)
        {
            const sa_sint_t slot1 = induction_bucket[bucket1]++;
            cache[pos + 1].symbol = slot1;

            if (slot1 < block_end)
            {
                sa_sint_t suffix1 = cache[pos + 1].index;
                if (suffix1 > 0)
                {
                    cache[slot1].index = (suffix1 - 1) | (sa_sint_t)((sa_uint_t)(T[suffix1 - 2] < T[suffix1 - 1]) << (SAINT_BIT - 1));
                    cache[slot1].symbol = T[suffix1 - 1];
                    suffix1 = 0;
                }
                cache[pos + 1].index = suffix1 & SAINT_MAX;
            }
        }

        pos += 2;
    }

    /* Remaining entries, one at a time and without look-ahead reads. */
    while (pos < block_end)
    {
        const sa_sint_t bucket = cache[pos].symbol;
        if (bucket >= 0)
        {
            const sa_sint_t slot = induction_bucket[bucket]++;
            cache[pos].symbol = slot;

            if (slot < block_end)
            {
                sa_sint_t suffix = cache[pos].index;
                if (suffix > 0)
                {
                    cache[slot].index = (suffix - 1) | (sa_sint_t)((sa_uint_t)(T[suffix - 2] < T[suffix - 1]) << (SAINT_BIT - 1));
                    cache[slot].symbol = T[suffix - 1];
                    suffix = 0;
                }
                cache[pos].index = suffix & SAINT_MAX;
            }
        }

        pos += 1;
    }
}

static sa_sint_t
/*
 * Processes one block [block_start, block_start + block_size) of the
 * left-to-right partial sorting scan (6k layout), optionally in parallel.
 *
 * The block is divided between threads with a stride rounded down to a
 * multiple of 16; the last thread takes the remainder. With a single thread
 * the sequential scan is called directly. Otherwise every thread gathers its
 * sub-block into the shared cache, the master thread alone runs the
 * sequential sort step over the whole block, and finally every thread places
 * its cached entries back into SA. Barriers separate the three phases.
 *
 * cache is passed as (cache - block_start) so the helpers can index it with
 * absolute block positions. Returns the updated d.
 */
libsais_partial_sorting_scan_left_to_right_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Per-thread stride is rounded down to a multiple of 16; the last thread absorbs the remainder. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            d = libsais_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                libsais_partial_sorting_scan_left_to_right_32s_6k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* Only the master thread updates d and the buckets, over the whole block. */
            #pragma omp master
            {
                d = libsais_partial_sorting_scan_left_to_right_32s_6k_block_sort(T, buckets, d, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            {
                libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }

    return d;
}

/*
 * Processes one block of the left-to-right partial sorting scan (4k layout),
 * optionally in parallel. Same three-phase structure as the 6k variant
 * (per-thread gather, master-only sort, per-thread placement), but the last
 * phase uses libsais_compact_and_place_cached_suffixes. Returns the updated d.
 */
static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            d = libsais_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                libsais_partial_sorting_scan_left_to_right_32s_4k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            #pragma omp master
            {
                d = libsais_partial_sorting_scan_left_to_right_32s_4k_block_sort(T, k, buckets, d, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            {
                libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }

    return d;
}

/*
 * Processes one block of the left-to-right partial sorting scan (1k layout),
 * optionally in parallel. Same gather / master-only sort / placement phases as
 * the 4k variant; this variant carries no d counter and returns nothing.
 */
static void libsais_partial_sorting_scan_left_to_right_32s_1k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            libsais_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                libsais_partial_sorting_scan_left_to_right_32s_1k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            #pragma omp master
            {
                libsais_partial_sorting_scan_left_to_right_32s_1k_block_sort(T, buckets, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            {
                libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }
}

#endif

/*
 * Driver of the left-to-right partial sorting scan (6k layout).
 *
 * First places suffix n - 1 (with its top bit set) into its bucket and
 * records a fresh d for that bucket. It then scans SA[0 .. left_suffixes_count):
 * sequentially when threads == 1 or the range is shorter than 65536 entries,
 * otherwise block by block (threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries
 * each) through the block_omp helper, using thread_state[0]'s cache.
 *
 * Returns the final d.
 */
static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    SA[buckets[BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN;
    buckets[2 + BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])] = ++d;

    if (threads == 1 || left_suffixes_count < 65536)
    {
        d = libsais_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, 0, left_suffixes_count);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start, block_end;
        for (block_start = 0; block_start < left_suffixes_count; block_start = block_end)
        {
            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > left_suffixes_count) { block_end = left_suffixes_count; }

            d = libsais_partial_sorting_scan_left_to_right_32s_6k_block_omp(T, SA, buckets, d, thread_state[0].state.cache, block_start, block_end - block_start, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif

    return d;
}

static sa_sint_t libsais_partial_sorting_scan_left_to_right_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads,
LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t * RESTRICT induction_bucket = &buckets[2 * (fast_sint_t)k];
    sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k];

    /* Seed the scan with suffix n - 1: type flag in the top bit plus the suffix-group marker. */
    SA[induction_bucket[T[n - 1]]++] = (n - 1) | (sa_sint_t)((sa_uint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)) | SUFFIX_GROUP_MARKER;
    distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] < T[n - 1])] = ++d;

    /* Small inputs and single-threaded runs use the sequential scan over the whole of SA[0 .. n). */
    if (threads == 1 || n < 65536)
    {
        d = libsais_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, 0, n);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start, block_end;
        for (block_start = 0; block_start < n; block_start = block_end)
        {
            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; }

            d = libsais_partial_sorting_scan_left_to_right_32s_4k_block_omp(T, SA, k, buckets, d, thread_state[0].state.cache, block_start, block_end - block_start, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif

    return d;
}

/*
 * Driver of the left-to-right partial sorting scan (1k layout).
 *
 * Places suffix n - 1 into its bucket (top bit records T[n - 2] < T[n - 1]),
 * then scans SA[0 .. n): sequentially when threads == 1 or n < 65536,
 * otherwise in blocks of threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries via
 * the block_omp helper, using thread_state[0]'s cache.
 */
static void libsais_partial_sorting_scan_left_to_right_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    SA[buckets[T[n - 1]]++] = (n - 1) | (sa_sint_t)((sa_uint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));

    if (threads == 1 || n < 65536)
    {
        libsais_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, 0, n);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start, block_end;
        for (block_start = 0; block_start < n; block_start = block_end)
        {
            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; }

            libsais_partial_sorting_scan_left_to_right_32s_1k_block_omp(T, SA, buckets, thread_state[0].state.cache, block_start, block_end - block_start, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif
}

/*
 * Shifts the top-bit (SAINT_MIN) markers by one position inside each SA range
 * (8-bit alphabet).
 *
 * For every c from BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0) down to
 * BUCKETS_INDEX2(1, 0), the range
 * SA[buckets[c - BUCKETS_INDEX2(1, 0)] .. temp_bucket[c] - 1] is walked from
 * its end to its start. Each entry receives the marker bit the previously
 * visited entry originally had; the first visited entry receives a set bit.
 * The lower bits are left unchanged.
 *
 * Each c starts from a fresh running state, so iterations are independent and
 * are distributed across threads when OpenMP is enabled, threads > 1 and
 * n >= 65536.
 */
static void libsais_partial_sorting_shift_markers_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, const sa_sint_t * RESTRICT buckets, sa_sint_t threads)
{
    const fast_sint_t prefetch_distance = 64;

    const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];

    fast_sint_t c;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && n >= 65536)
#else
    UNUSED(threads); UNUSED(n);
#endif
    for (c = BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); c >= BUCKETS_INDEX2(1, 0); c -= BUCKETS_INDEX2(1, 0))
    {
        /* s holds the marker bit to hand to the next entry; q is the XOR mask that swaps it in. */
        fast_sint_t i, j; sa_sint_t s = SAINT_MIN;
        for (i = (fast_sint_t)temp_bucket[c] - 1, j = (fast_sint_t)buckets[c - BUCKETS_INDEX2(1, 0)] + 3; i >= j; i -= 4)
        {
            libsais_prefetchw(&SA[i - prefetch_distance]);

            sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s; s = s ^ q0; SA[i - 0] = p0 ^ q0;
            sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1;
            sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2;
            sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3;
        }

        for (j -= 3; i >= j; i -= 1)
        {
            sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q;
        }
    }
}

/*
 * Shifts the top-bit (SAINT_MIN) markers by one position inside each SA range
 * (32-bit alphabet, 6k layout).
 *
 * For every c from k - 1 down to 1, the range
 * SA[temp_bucket[BUCKETS_INDEX2(c - 1, 0)] .. buckets[BUCKETS_INDEX4(c, 0)] - 1]
 * is walked from its end to its start with the same marker hand-off as the
 * 8u variant. Parallelised over c when OpenMP is enabled, threads > 1 and
 * k >= 65536.
 */
static void libsais_partial_sorting_shift_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, const sa_sint_t * RESTRICT buckets, sa_sint_t threads)
{
    const fast_sint_t prefetch_distance = 64;

    const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k];

    fast_sint_t c;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && k >= 65536)
#else
    UNUSED(threads);
#endif
    for (c = (fast_sint_t)k - 1; c >= 1; c -= 1)
    {
        fast_sint_t i, j; sa_sint_t s = SAINT_MIN;
        for (i = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 0)] - 1, j = (fast_sint_t)temp_bucket[BUCKETS_INDEX2(c - 1, 0)] + 3; i >= j; i -= 4)
        {
            libsais_prefetchw(&SA[i - prefetch_distance]);

            sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s; s = s ^ q0; SA[i - 0] = p0 ^ q0;
            sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1;
            sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2;
            sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3;
        }

        for (j -= 3; i >= j; i -= 1)
        {
            sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q;
        }
    }
}

/*
 * Shifts the suffix-group markers across SA[0 .. n) (32-bit alphabet, 4k layout).
 *
 * SA is walked from n - 1 down to 0 with a running state s that starts as
 * SUFFIX_GROUP_MARKER. Only entries with a positive value take part: the XOR
 * mask q is forced to zero for p <= 0, leaving both the entry and s unchanged.
 * NOTE(review): the mask bit is built as 1 << (SUFFIX_GROUP_BIT - 1), which is
 * presumably the same bit as SUFFIX_GROUP_MARKER -- confirm against its definition.
 */
static void libsais_partial_sorting_shift_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i; sa_sint_t s = SUFFIX_GROUP_MARKER;
    for (i = (fast_sint_t)n - 1; i >= 3; i -= 4)
    {
        libsais_prefetchw(&SA[i - prefetch_distance]);

        sa_sint_t p0 = SA[i - 0], q0 = ((p0 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p0 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q0; SA[i - 0] = p0 ^ q0;
        sa_sint_t p1 = SA[i - 1], q1 = ((p1 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p1 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q1; SA[i - 1] = p1 ^ q1;
        sa_sint_t p2 = SA[i - 2], q2 = ((p2 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p2 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q2; SA[i - 2] = p2 ^ q2;
        sa_sint_t p3 = SA[i - 3], q3 = ((p3 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p3 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q3; SA[i - 3] = p3 ^ q3;
    }

    for (; i >= 0; i -= 1)
    {
        sa_sint_t p = SA[i], q = ((p & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q; SA[i] = p ^ q;
    }
}

/*
 * Copies the two per-symbol values kept in temp_bucket (located at
 * buckets[4 * k], BUCKETS_INDEX2 layout) into the (c, 0) and (c, 1) slots of
 * the BUCKETS_INDEX4-layout table at the start of buckets, for every symbol
 * c in [0, k).
 */
static void libsais_partial_sorting_shift_buckets_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
{
    sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k];

    fast_sint_t i;
    for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0))
    {
        buckets[2 * i + BUCKETS_INDEX4(0, 0)] = temp_bucket[i + BUCKETS_INDEX2(0, 0)];
        buckets[2 * i + BUCKETS_INDEX4(0, 1)] = temp_bucket[i + BUCKETS_INDEX2(0, 1)];
    }
}

/*
 * Sequential right-to-left partial sorting scan (8-bit alphabet).
 *
 * Walks SA[omp_block_start .. omp_block_start + omp_block_size) from the end
 * to the start. For each entry p: d is incremented when p's top bit is set,
 * the bit is cleared, and p - 1 is written to the slot obtained by
 * pre-decrementing induction_bucket[v], where
 * v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]). The written value gets
 * its top bit set when distinct_names[v] differs from d, after which
 * distinct_names[v] is set to d. Returns the updated d.
 */
static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance =
64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

    fast_sint_t i, j;

    /* Main loop: two entries per iteration, prefetching the text around entries prefetch_distance ahead. */
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais_prefetchr(&SA[i - 2 * prefetch_distance]);

        libsais_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1);
        libsais_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2);
        libsais_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1);
        libsais_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2);

        sa_sint_t p0 = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
        SA[--induction_bucket[v0]] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d;

        sa_sint_t p1 = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
        SA[--induction_bucket[v1]] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
    }

    /* Tail loop: remaining entries down to omp_block_start, one at a time. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
        SA[--induction_bucket[v]] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
    }

    return d;
}

/*
 * GSA flavour of the sequential right-to-left partial sorting scan
 * (8-bit alphabet).
 *
 * Identical to libsais_partial_sorting_scan_right_to_left_8u except that
 * entries whose bucket index v equals 1 are not induced: nothing is written
 * to SA or distinct_names for them, although d is still advanced by their top
 * bit. Returns the updated d.
 */
static sa_sint_t libsais_partial_gsa_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais_prefetchr(&SA[i - 2 * prefetch_distance]);

        libsais_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1);
        libsais_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2);
        libsais_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1);
        libsais_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2);

        sa_sint_t p0 = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
        if (v0 != 1) { SA[--induction_bucket[v0]] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; }

        sa_sint_t p1 = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
        if (v1 != 1) { SA[--induction_bucket[v1]] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d; }
    }

    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
        if (v != 1) { SA[--induction_bucket[v]] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; }
    }

    return d;
}

#if defined(LIBSAIS_OPENMP)

/*
 * Per-thread preparation phase of the parallel right-to-left scan
 * (8-bit alphabet).
 *
 * Zeroes the first 2 * k entries of this thread's induction_bucket and
 * distinct_names tables, then walks the thread's SA sub-block from the end to
 * the start. Every entry is copied into cache (index = raw SA value,
 * symbol = its bucket index v), induction_bucket[v] is counted up, and
 * distinct_names[v] records a block-local d that starts at 1 and grows with
 * each top-bit-set entry. Nothing is written to SA here.
 *
 * On return state[0].state.position holds the number of top-bit-set entries
 * seen (d - 1) and state[0].state.count the number of cached entries.
 */
static void libsais_partial_sorting_scan_right_to_left_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size, LIBSAIS_THREAD_STATE * RESTRICT state)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

    memset(induction_bucket, 0, (size_t)2 * (size_t)k * sizeof(sa_sint_t));
    memset(distinct_names , 0, (size_t)2 * (size_t)k * sizeof(sa_sint_t));

    fast_sint_t i, j, count = 0; sa_sint_t d = 1;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais_prefetchr(&SA[i - 2 * prefetch_distance]);

        libsais_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1);
        libsais_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2);
        libsais_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1);
        libsais_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2);

        sa_sint_t p0 = cache[count].index = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); induction_bucket[v0]++; distinct_names[v0] = d;
        sa_sint_t p1 = cache[count].index = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); induction_bucket[v1]++; distinct_names[v1] = d;
    }

    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = cache[count].index = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); induction_bucket[v]++; distinct_names[v] = d;
    }

    state[0].state.position = (fast_sint_t)d - 1;
    state[0].state.count = count;
}

/*
 * Per-thread placement phase of the parallel right-to-left scan
 * (8-bit alphabet).
 *
 * Replays the first count cache entries in order. For each one d is
 * incremented when the cached index has its top bit set, and (index - 1) is
 * written to the SA slot obtained by pre-decrementing induction_bucket[symbol],
 * with the top bit of the written value set when distinct_names[symbol]
 * differs from d. The cached index is used without masking off its top bit.
 * buckets and d are supplied by the caller.
 */
static void libsais_partial_sorting_scan_right_to_left_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

    fast_sint_t i, j;
    for (i = 0, j = count - 1; i < j; i += 2)
    {
        libsais_prefetchr(&cache[i + prefetch_distance]);

        sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol;
        SA[--induction_bucket[v0]] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d;

        sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol;
        SA[--induction_bucket[v1]] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
    }
    /* Handles the last entry when count is odd. */
    for (j += 1; i < j; i += 1)
    {
        sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol;
        SA[--induction_bucket[v]] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
    }
}

/*
 * GSA flavour of the per-thread placement phase (8-bit alphabet).
 *
 * Replays the first count cache entries in order, as the non-GSA placement
 * routine does, but cache entries whose symbol equals 1 are not written to SA
 * and do not update distinct_names; d is still advanced by their top bit.
 */
static void libsais_partial_gsa_scan_right_to_left_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

    fast_sint_t i, j;
    for (i = 0, j = count - 1; i < j; i += 2)
    {
        libsais_prefetchr(&cache[i + prefetch_distance]);

        sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol;
        if (v0 != 1) { SA[--induction_bucket[v0]] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; }

        sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol;
        if (v1 != 1) { SA[--induction_bucket[v1]] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d; }
    }

    for (j += 1; i < j; i += 1)
    {
        sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol;
        if (v != 1) { SA[--induction_bucket[v]] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; }
    }
}

/*
 * Processes one block of the right-to-left partial sorting scan
 * (8-bit alphabet), optionally in parallel.
 *
 * The parallel region is entered only when threads > 1, the block holds at
 * least 64 * max(k, 256) entries and dynamic thread adjustment is off. With a
 * single thread the sequential scan is called directly. Otherwise:
 *   1. every thread runs the prepare phase on its sub-block, filling its own
 *      buckets, cache, position and count;
 *   2. the master thread walks the threads from last to first, turning the
 *      per-thread counts into per-thread starting bucket positions and the
 *      block-local distinct names into global ones, and accumulating d;
 *   3. every thread replays its cache into SA with the placement routine.
 *
 * Returns the updated d.
 */
static sa_sint_t libsais_partial_sorting_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ? k : 256) && omp_get_dynamic() == 0)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(k); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Per-thread stride is rounded down to a multiple of 16; the last thread absorbs the remainder. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            d = libsais_partial_sorting_scan_right_to_left_8u(T, SA, buckets, d, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                libsais_partial_sorting_scan_right_to_left_8u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]);
            }

            #pragma omp barrier

            #pragma omp master
            {
                sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
                sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

                /* Threads are merged from last to first, matching the right-to-left scan order. */
                fast_sint_t t;
                for (t = omp_num_threads - 1; t >= 0; --t)
                {
                    sa_sint_t * RESTRICT temp_induction_bucket = &thread_state[t].state.buckets[0 * ALPHABET_SIZE];
                    sa_sint_t * RESTRICT temp_distinct_names = &thread_state[t].state.buckets[2 * ALPHABET_SIZE];

                    /* The thread receives the current global position; the global one drops by the thread's count. */
                    fast_sint_t c;
                    for (c = 0; c < 2 * k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A - B; temp_induction_bucket[c] = A; }

                    /* Local names (B > 0 means the thread touched this bucket) are rebased by d - 1; the thread receives the previous global names. */
                    for (d -= 1, c = 0; c < 2 * k; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ? D : A; temp_distinct_names[c] = A; }

                    /* position switches from "increments seen by this thread" to "d at the start of this thread's sub-block". */
                    d += 1 + (sa_sint_t)thread_state[t].state.position; thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position;
                }
            }

            #pragma omp barrier

            {
                libsais_partial_sorting_scan_right_to_left_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position);
            }
        }
#endif
    }

    return d;
}

/*
 * GSA flavour of the block-parallel right-to-left scan (8-bit alphabet).
 *
 * Same structure as libsais_partial_sorting_scan_right_to_left_8u_block_omp,
 * including the shared prepare phase; only the sequential fallback and the
 * placement phase use the gsa routines. Returns the updated d.
 */
static sa_sint_t libsais_partial_gsa_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ? k : 256) && omp_get_dynamic() == 0)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(k); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            d = libsais_partial_gsa_scan_right_to_left_8u(T, SA, buckets, d, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /* Phase 1: every thread fills its own buckets, cache, position and count. */
            {
                libsais_partial_sorting_scan_right_to_left_8u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]);
            }

            #pragma omp barrier

            /* Phase 2: the master thread merges per-thread results into the global tables, last thread first. */
            #pragma omp master
            {
                sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
                sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

                fast_sint_t t;
                for (t = omp_num_threads - 1; t >= 0; --t)
                {
                    sa_sint_t * RESTRICT temp_induction_bucket = &thread_state[t].state.buckets[0 * ALPHABET_SIZE];
                    sa_sint_t * RESTRICT temp_distinct_names = &thread_state[t].state.buckets[2 * ALPHABET_SIZE];

                    /* The thread receives the current global position; the global one drops by the thread's count. */
                    fast_sint_t c;
                    for (c = 0; c < 2 * k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A - B; temp_induction_bucket[c] = A; }

                    /* Local names (B > 0 means the thread touched this bucket) are rebased by d - 1; the thread receives the previous global names. */
                    for (d -= 1, c = 0; c < 2 * k; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ? D : A; temp_distinct_names[c] = A; }

                    d += 1 + (sa_sint_t)thread_state[t].state.position; thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position;
                }
            }

            #pragma omp barrier

            /* Phase 3: every thread replays its cache into SA (gsa placement). */
            {
                libsais_partial_gsa_scan_right_to_left_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position);
            }
        }
#endif
    }

    return d;
}

#endif

/*
 * Driver of the right-to-left partial sorting scan (8-bit alphabet).
 *
 * Scans SA[left_suffixes_count + 1 .. n - first_lms_suffix). With a single
 * thread, or fewer than 65536 entries, the sequential scan is used. Otherwise
 * the range is consumed from its end: zero entries are skipped, and each
 * maximal run of non-zero entries (capped at
 * threads * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads) entries) is either
 * processed inline when shorter than 32 entries or handed to the block_omp
 * helper.
 *
 * The final value of d is not returned.
 */
static void libsais_partial_sorting_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
    fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;

    if (threads == 1 || (scan_end - scan_start) < 65536)
    {
        libsais_partial_sorting_scan_right_to_left_8u(T, SA, buckets, d, scan_start, scan_end - scan_start);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
        sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

        fast_sint_t block_start;
        for (block_start = scan_end - 1; block_start >= scan_start; )
        {
            if (SA[block_start] == 0)
            {
                block_start--;
            }
            else
            {
                /* Grow the block downwards until a zero entry, the size cap or the start of the scan range. */
                fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < scan_start) { block_max_end = scan_start - 1; }
                fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
                fast_sint_t block_size = block_start - block_end;

                if (block_size < 32)
                {
                    for (; block_start > block_end; block_start -= 1)
                    {
                        sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
                        SA[--induction_bucket[v]] = (p - 1) |
(sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
                    }
                }
                else
                {
                    d = libsais_partial_sorting_scan_right_to_left_8u_block_omp(T, SA, k, buckets, d, block_end + 1, block_size, threads, thread_state);
                    block_start = block_end;
                }
            }
        }
    }
#else
    UNUSED(k); UNUSED(thread_state);
#endif
}

/*
 * GSA flavour of the right-to-left partial sorting scan driver
 * (8-bit alphabet).
 *
 * Scans SA[left_suffixes_count + 1 .. n - first_lms_suffix). With a single
 * thread, or fewer than 65536 entries, the sequential gsa scan is used.
 * Otherwise the range is consumed from its end: zero entries are skipped and
 * each run of non-zero entries (capped at
 * threads * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads) entries) is either
 * processed inline when shorter than 32 entries or handed to the gsa
 * block_omp helper. Entries whose bucket index equals 1 are never induced.
 *
 * The final value of d is not returned.
 */
static void libsais_partial_gsa_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
    fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;

    if (threads == 1 || (scan_end - scan_start) < 65536)
    {
        libsais_partial_gsa_scan_right_to_left_8u(T, SA, buckets, d, scan_start, scan_end - scan_start);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
        sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

        fast_sint_t block_start;
        for (block_start = scan_end - 1; block_start >= scan_start; )
        {
            if (SA[block_start] == 0)
            {
                block_start--;
            }
            else
            {
                /* Grow the block downwards until a zero entry, the size cap or the start of the scan range. */
                fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < scan_start) { block_max_end = scan_start - 1; }
                fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
                fast_sint_t block_size = block_start - block_end;

                if (block_size < 32)
                {
                    for (; block_start > block_end; block_start -= 1)
                    {
                        sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
                        if (v != 1) { SA[--induction_bucket[v]] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; }
                    }
                }
                else
                {
                    d = libsais_partial_gsa_scan_right_to_left_8u_block_omp(T, SA, k, buckets, d, block_end + 1, block_size, threads, thread_state);
                    block_start = block_end;
                }
            }
        }
    }
#else
    UNUSED(k); UNUSED(thread_state);
#endif
}

/*
 * Sequential right-to-left partial sorting scan (32-bit alphabet, 6k layout).
 *
 * Walks SA[omp_block_start .. omp_block_start + omp_block_size) from the end
 * to the start. For each entry p: d is incremented when p's top bit is set,
 * the bit is cleared, and p - 1 is written to the slot obtained by
 * pre-decrementing buckets[v], where
 * v = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]). The written value gets
 * its top bit set when buckets[2 + v] differs from d, after which
 * buckets[2 + v] is set to d.
 *
 * The main loop prefetches in three stages: SA three prefetch distances
 * ahead, the text two distances ahead and the bucket one distance ahead.
 * Returns the updated d.
 */
static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais_prefetchr(&SA[i - 3 * prefetch_distance]);

        libsais_prefetchr(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 1);
        libsais_prefetchr(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 2);
        libsais_prefetchr(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 1);
        libsais_prefetchr(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 2);

        /* (p > 0) keeps the look-ahead text read at index 0 when the entry is zero. */
        sa_sint_t p0 = SA[i - prefetch_distance - 0] & SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); libsais_prefetchw(&buckets[v0]);
        sa_sint_t p1 = SA[i - prefetch_distance - 1] & SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0); libsais_prefetchw(&buckets[v1]);

        sa_sint_t p2 = SA[i - 0]; d += (p2 < 0); p2 &= SAINT_MAX; sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] > T[p2 - 1]);
        SA[--buckets[v2]] = (p2 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1)); buckets[2 + v2] = d;

        sa_sint_t p3 = SA[i - 1]; d += (p3 < 0); p3 &= SAINT_MAX; sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] > T[p3 - 1]);
        SA[--buckets[v3]] = (p3 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1)); buckets[2 + v3] = d;
    }

    /* Tail loop: remaining entries down to omp_block_start, one at a time and without look-ahead. */
    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]);
        SA[--buckets[v]] = (p - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
    }

    return d;
}

static sa_sint_t
libsais_partial_sorting_scan_right_to_left_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; sa_sint_t * RESTRICT induction_bucket = &buckets[3 * (fast_sint_t)k]; sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k]; fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) { libsais_prefetchw(&SA[i - 3 * prefetch_distance]); sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 & ~SUFFIX_GROUP_MARKER : 2]; libsais_prefetchr(Ts0 - 1); libsais_prefetchr(Ts0 - 2); sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ? s1 & ~SUFFIX_GROUP_MARKER : 2]; libsais_prefetchr(Ts1 - 1); libsais_prefetchr(Ts1 - 2); sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts2]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); } sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais_prefetchw(&induction_bucket[Ts3]); libsais_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); } sa_sint_t p0 = SA[i - 0]; if (p0 > 0) { SA[i - 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d; } sa_sint_t p1 = SA[i - 1]; if (p1 > 0) { SA[i - 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); p1 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); SA[--induction_bucket[T[p1 - 
1]]] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d; } } for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) { sa_sint_t p = SA[i]; if (p > 0) { SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); SA[--induction_bucket[T[p - 1]]] = (p - 1) | (sa_sint_t)((sa_uint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d; } } return d; } static void libsais_partial_sorting_scan_right_to_left_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) { libsais_prefetchw(&SA[i - 3 * prefetch_distance]); sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 1]; libsais_prefetchr(Ts0 - 1); sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ? 
s1 : 1]; libsais_prefetchr(Ts1 - 1);
        sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetchr(&T[s2] - 2); }
        sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetchr(&T[s3] - 2); }

        // Process the two current entries: zero the slot, then store (p - 1) plus the
        // comparison flag at the decremented bucket position for symbol T[p - 1].
        sa_sint_t p0 = SA[i - 0]; if (p0 > 0) { SA[i - 0] = 0; SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)); }
        sa_sint_t p1 = SA[i - 1]; if (p1 > 0) { SA[i - 1] = 0; SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)); }
    }

    // Tail loop: remaining leftmost entries, one at a time and without prefetching.
    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; if (p > 0) { SA[i] = 0; SA[--induction_bucket[T[p - 1]]] = (p - 1) | (sa_sint_t)((sa_uint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)); }
    }
}

// The helpers from here on are compiled only when LIBSAIS_OPENMP is defined.
#if defined(LIBSAIS_OPENMP)

/*
 * Gather step of the blocked "6k" right-to-left scan.
 *
 * Copies SA[i] into cache[i].index for every i in
 * [omp_block_start, omp_block_start + omp_block_size) and stores in cache[i].symbol
 * the value BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]) for p = SA[i] & SAINT_MAX,
 * or 0 when p is 0. SA itself is only read. `cache` is indexed with the same
 * positions as SA.
 */
static void libsais_partial_sorting_scan_right_to_left_32s_6k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    // Main loop: two entries per iteration with look-ahead prefetches.
    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais_prefetchr(&SA[i + 2 * prefetch_distance]);

        libsais_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
        libsais_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
        libsais_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
        libsais_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);

        libsais_prefetchw(&cache[i + prefetch_distance]);

        // The raw SA value (top bit included) goes to .index; only the local copy is masked.
        sa_sint_t p0 = cache[i + 0].index = SA[i + 0]; sa_sint_t symbol0 = 0; p0 &= SAINT_MAX; if (p0 != 0) { symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); } cache[i + 0].symbol = symbol0;
        sa_sint_t p1 = cache[i + 1].index = SA[i + 1]; sa_sint_t symbol1 = 0; p1 &= SAINT_MAX; if (p1 != 0) { symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); } cache[i + 1].symbol = symbol1;
    }

    // Tail loop: the last entries of the block, without prefetching.
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = cache[i].index = SA[i]; sa_sint_t symbol = 0; p &= SAINT_MAX; if (p != 0) { symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]); } cache[i].symbol = symbol;
    }
}

/*
 * Gather step of the blocked "4k" right-to-left scan.
 *
 * For every strictly positive SA[i] in the block: the SA slot is zeroed, the raw
 * value is saved in cache[i].index, and cache[i].symbol receives
 * BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]) computed with the
 * SUFFIX_GROUP_MARKER bit cleared. Other entries get symbol SAINT_MIN and their
 * cache index is left untouched.
 */
static void libsais_partial_sorting_scan_right_to_left_32s_4k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);

        // Non-positive entries fall back to index 2, so Ts - 1 and Ts - 2 still point inside T.
        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 & ~SUFFIX_GROUP_MARKER : 2]; libsais_prefetchr(Ts0 - 1); libsais_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ? s1 & ~SUFFIX_GROUP_MARKER : 2]; libsais_prefetchr(Ts1 - 1); libsais_prefetchr(Ts1 - 2);

        libsais_prefetchw(&cache[i + prefetch_distance]);

        // SAINT_MIN in .symbol marks an entry that carries no work.
        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { SA[i + 0] = 0; cache[i + 0].index = p0; p0 &= ~SUFFIX_GROUP_MARKER; symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); } cache[i + 0].symbol = symbol0;
        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { SA[i + 1] = 0; cache[i + 1].index = p1; p1 &= ~SUFFIX_GROUP_MARKER; symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); } cache[i + 1].symbol = symbol1;
    }

    // Tail loop: the last entries of the block, without prefetching.
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { SA[i] = 0; cache[i].index = p; p &= ~SUFFIX_GROUP_MARKER; symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); } cache[i].symbol = symbol;
    }
}

/*
 * Gather step of the blocked "1k" right-to-left scan.
 *
 * For every strictly positive SA[i] in the block: the SA slot is zeroed,
 * cache[i].index receives (p - 1) with the top bit set when T[p - 2] > T[p - 1],
 * and cache[i].symbol receives T[p - 1]. Other entries get symbol SAINT_MIN.
 */
static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);

        // Non-positive entries fall back to index 2, so Ts - 1 and Ts - 2 still point inside T.
        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais_prefetchr(Ts0 - 1); libsais_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 : 2]; libsais_prefetchr(Ts1 - 1); libsais_prefetchr(Ts1 - 2);

        libsais_prefetchw(&cache[i + prefetch_distance]);

        // SAINT_MIN in .symbol marks an entry that carries no work.
        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { SA[i + 0] = 0; cache[i + 0].index = (p0 - 1) | (sa_sint_t)((sa_uint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)); symbol0 = T[p0 - 1]; } cache[i + 0].symbol = symbol0;
        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { SA[i + 1] = 0; cache[i + 1].index = (p1 - 1) | (sa_sint_t)((sa_uint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)); symbol1 = T[p1 - 1]; } cache[i + 1].symbol = symbol1;
    }

    // Tail loop: the last entries of the block, without prefetching.
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { SA[i] = 0; cache[i].index = (p - 1) | (sa_sint_t)((sa_uint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)); symbol = T[p - 1]; } cache[i].symbol = symbol;
    }
}

/*
 * Sort step of the blocked "6k" right-to-left scan; walks the gathered cache from
 * right to left.
 *
 * For each cache entry (symbol v, index p): `d` is incremented when p is negative,
 * the entry's symbol is replaced by the destination position --buckets[v], and its
 * index by (p - 1) with the top bit set when buckets[2 + v] != d; buckets[2 + v] is
 * then set to `d`. When the destination is at or beyond omp_block_start, the new
 * value is also written into cache[destination] together with a freshly computed
 * symbol, so that position is handled later in this same pass.
 *
 * Returns the updated `d`.
 */
static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais_prefetchw(&cache[i - 2 * prefetch_distance]);

        libsais_prefetchw(&buckets[cache[i - prefetch_distance - 0].symbol]);
        libsais_prefetchw(&buckets[cache[i - prefetch_distance - 1].symbol]);

        sa_sint_t v0 = cache[i - 0].symbol, p0 = cache[i - 0].index; d += (p0 < 0); cache[i - 0].symbol = --buckets[v0]; cache[i - 0].index = (p0 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1)); buckets[2 + v0] = d;
        // Destination lies inside the cached window: forward the value so it is processed in this pass.
        if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t s = cache[i - 0].symbol, q = (cache[s].index = cache[i - 0].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); }

        sa_sint_t v1 = cache[i - 1].symbol, p1 = cache[i - 1].index; d += (p1 < 0); cache[i - 1].symbol = --buckets[v1]; cache[i - 1].index = (p1 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1)); buckets[2 + v1] = d;
        if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t s = cache[i - 1].symbol, q = (cache[s].index = cache[i - 1].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); }
    }

    // Tail loop: remaining leftmost entries, without prefetching.
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t v = cache[i].symbol, p = cache[i].index; d += (p < 0); cache[i].symbol = --buckets[v]; cache[i].index = (p - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
        if (cache[i].symbol >= omp_block_start) { sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); }
    }

    return d;
}

/*
 * Sort step of the blocked "4k" right-to-left scan; walks the gathered cache from
 * right to left.
 *
 * Entries whose symbol is negative are skipped. For the others (symbol v, index p):
 * `d` is advanced by (p >> (SUFFIX_GROUP_BIT - 1)), the symbol is replaced by the
 * destination --induction_bucket[v >> 1], and the index by (p - 1) combined with
 * (v << (SAINT_BIT - 1)) and the suffix-group bit (distinct_names[v] != d);
 * distinct_names[v] is then set to `d`. When the destination is at or beyond
 * omp_block_start and the new index is positive, the value is moved into
 * cache[destination] (the current entry's index is zeroed) with a freshly computed
 * symbol.
 *
 * induction_bucket starts at buckets[3 * k]; distinct_names starts at buckets[0].
 * Returns the updated `d`.
 */
static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[3 * (fast_sint_t)k];
    sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k];

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais_prefetchw(&cache[i - 2 * prefetch_distance]);

        // Non-positive symbols are clamped to index 0 for the prefetch address.
        sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 > 0 ? s0 >> 1 : 0]; libsais_prefetchw(Is0); const sa_sint_t * Ds0 = &distinct_names[s0 > 0 ? s0 : 0]; libsais_prefetchw(Ds0);
        sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 > 0 ? s1 >> 1 : 0]; libsais_prefetchw(Is1); const sa_sint_t * Ds1 = &distinct_names[s1 > 0 ? s1 : 0]; libsais_prefetchw(Ds1);

        sa_sint_t v0 = cache[i - 0].symbol;
        if (v0 >= 0)
        {
            sa_sint_t p0 = cache[i - 0].index; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); cache[i - 0].symbol = --induction_bucket[v0 >> 1]; cache[i - 0].index = (p0 - 1) | (sa_sint_t)((sa_uint_t)v0 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d;
            if (cache[i - 0].symbol >= omp_block_start)
            {
                sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index;
                if (np > 0)
                {
                    cache[i - 0].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]);
                }
            }
        }

        sa_sint_t v1 = cache[i - 1].symbol;
        if (v1 >= 0)
        {
            sa_sint_t p1 = cache[i - 1].index; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); cache[i - 1].symbol = --induction_bucket[v1 >> 1]; cache[i - 1].index = (p1 - 1) | (sa_sint_t)((sa_uint_t)v1 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
            if (cache[i - 1].symbol >= omp_block_start)
            {
                sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index;
                if (np > 0)
                {
                    cache[i - 1].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]);
                }
            }
        }
    }

    // Tail loop: remaining leftmost entries, without prefetching.
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t v = cache[i].symbol;
        if (v >= 0)
        {
            sa_sint_t p = cache[i].index; d += (p >> (SUFFIX_GROUP_BIT - 1)); cache[i].symbol = --induction_bucket[v >> 1]; cache[i].index = (p - 1) | (sa_sint_t)((sa_uint_t)v << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
            if (cache[i].symbol >= omp_block_start)
            {
                sa_sint_t ni = cache[i].symbol, np = cache[i].index;
                if (np > 0)
                {
                    cache[i].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]);
                }
            }
        }
    }

    return d;
}

/*
 * Sort step of the blocked "1k" right-to-left scan; walks the gathered cache from
 * right to left, replacing each non-negative symbol with its destination position
 * taken from induction_bucket.
 */
static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_sort(const sa_sint_t *
RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais_prefetchw(&cache[i - 2 * prefetch_distance]);

        // Non-positive symbols are clamped to index 0 for the prefetch address.
        sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 > 0 ? s0 : 0]; libsais_prefetchw(Is0);
        sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 > 0 ? s1 : 0]; libsais_prefetchw(Is1);

        sa_sint_t v0 = cache[i - 0].symbol;
        if (v0 >= 0)
        {
            // The symbol field is overwritten with the destination position.
            cache[i - 0].symbol = --induction_bucket[v0];
            if (cache[i - 0].symbol >= omp_block_start)
            {
                // Destination is at or beyond the block start: move the value into the
                // cache slot of the destination so it is handled later in this pass.
                sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index;
                if (np > 0)
                {
                    cache[i - 0].index = 0; cache[ni].index = (np - 1) | (sa_sint_t)((sa_uint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1];
                }
            }
        }

        sa_sint_t v1 = cache[i - 1].symbol;
        if (v1 >= 0)
        {
            cache[i - 1].symbol = --induction_bucket[v1];
            if (cache[i - 1].symbol >= omp_block_start)
            {
                sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index;
                if (np > 0)
                {
                    cache[i - 1].index = 0; cache[ni].index = (np - 1) | (sa_sint_t)((sa_uint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1];
                }
            }
        }
    }

    // Tail loop: remaining leftmost entries, without prefetching.
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t v = cache[i].symbol;
        if (v >= 0)
        {
            cache[i].symbol = --induction_bucket[v];
            if (cache[i].symbol >= omp_block_start)
            {
                sa_sint_t ni = cache[i].symbol, np = cache[i].index;
                if (np > 0)
                {
                    cache[i].index = 0; cache[ni].index = (np - 1) | (sa_sint_t)((sa_uint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1];
                }
            }
        }
    }
}

/*
 * Processes one block of the "6k" right-to-left scan, optionally in parallel.
 *
 * The parallel region is entered only when threads > 1 and block_size >= 16384.
 * The block is divided into per-thread slices whose length is rounded down to a
 * multiple of 16; the last thread also takes the remainder. With one thread the
 * serial scan is used. Otherwise three phases run, separated by barriers:
 *   1. every thread gathers its slice of SA into the cache,
 *   2. the master thread alone runs the sort step over the whole block,
 *   3. every thread writes its slice of the cache back via
 *      libsais_place_cached_suffixes.
 *
 * `cache - block_start` rebases the cache pointer so that the helpers can index it
 * with absolute SA positions. Returns the updated `d`.
 */
static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        // & (-16) rounds the per-thread stride down to a multiple of 16.
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            d = libsais_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                libsais_partial_sorting_scan_right_to_left_32s_6k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            // Only the master thread updates buckets and d; the other threads wait at the next barrier.
            #pragma omp master
            {
                d = libsais_partial_sorting_scan_right_to_left_32s_6k_block_sort(T, buckets, d, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            {
                libsais_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }

    return d;
}

/*
 * Processes one block of the "4k" right-to-left scan, optionally in parallel.
 *
 * Same three-phase structure as the 6k variant (parallel gather, master-only sort,
 * parallel write-back), except that the write-back phase uses
 * libsais_compact_and_place_cached_suffixes and `k` is forwarded to the serial scan
 * and the sort step. Returns the updated `d`.
 */
static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        // & (-16) rounds the per-thread stride down to a multiple of 16.
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            d = libsais_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                libsais_partial_sorting_scan_right_to_left_32s_4k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            // Only the master thread updates buckets and d; the other threads wait at the next barrier.
            #pragma omp master
            {
                d = libsais_partial_sorting_scan_right_to_left_32s_4k_block_sort(T, k, buckets, d, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            {
                libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }

    return d;
}

/*
 * Processes one block of the "1k" right-to-left scan, optionally in parallel.
 * Uses the same per-thread partitioning as the 6k/4k variants and returns nothing.
 */
static void libsais_partial_sorting_scan_right_to_left_32s_1k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        // & (-16) rounds the per-thread stride down to a multiple of 16.
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { libsais_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { libsais_partial_sorting_scan_right_to_left_32s_1k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); } #pragma omp barrier #pragma omp master { libsais_partial_sorting_scan_right_to_left_32s_1k_block_sort(T, buckets, cache - block_start, block_start, block_size); } #pragma omp barrier { libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); } } #endif } } #endif static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1; fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix; if (threads == 1 || (scan_end - scan_start) < 65536) { d = libsais_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, scan_start, scan_end - scan_start); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t block_start, block_end; for (block_start = scan_end - 1; block_start >= scan_start; block_start = block_end) { block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < scan_start) { block_end = scan_start - 1; } d = libsais_partial_sorting_scan_right_to_left_32s_6k_block_omp(T, SA, buckets, d, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads); } } #else UNUSED(thread_state); #endif return d; } static sa_sint_t libsais_partial_sorting_scan_right_to_left_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, 
sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { if (threads == 1 || n < 65536) { d = libsais_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, 0, n); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t block_start, block_end; for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end) { block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; } d = libsais_partial_sorting_scan_right_to_left_32s_4k_block_omp(T, SA, k, buckets, d, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads); } } #else UNUSED(thread_state); #endif return d; } static void libsais_partial_sorting_scan_right_to_left_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { if (threads == 1 || n < 65536) { libsais_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, 0, n); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t block_start, block_end; for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end) { block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; } libsais_partial_sorting_scan_right_to_left_32s_1k_block_omp(T, SA, buckets, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads); } } #else UNUSED(thread_state); #endif } static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j, l; for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4) { libsais_prefetchr(&SA[i + prefetch_distance]); sa_uint_t s0 = (sa_uint_t)SA[i + 0]; SA[l] = (sa_sint_t)((s0 - (sa_uint_t)SUFFIX_GROUP_MARKER) & (sa_uint_t)(~SUFFIX_GROUP_MARKER)); l += 
((sa_sint_t)s0 < 0);
        // Every value is written to SA[l], but l only advances when the source entry
        // was negative; a non-negative entry is simply overwritten by the next store.
        sa_uint_t s1 = (sa_uint_t)SA[i + 1]; SA[l] = (sa_sint_t)((s1 - (sa_uint_t)SUFFIX_GROUP_MARKER) & (sa_uint_t)(~SUFFIX_GROUP_MARKER)); l += ((sa_sint_t)s1 < 0);
        sa_uint_t s2 = (sa_uint_t)SA[i + 2]; SA[l] = (sa_sint_t)((s2 - (sa_uint_t)SUFFIX_GROUP_MARKER) & (sa_uint_t)(~SUFFIX_GROUP_MARKER)); l += ((sa_sint_t)s2 < 0);
        sa_uint_t s3 = (sa_uint_t)SA[i + 3]; SA[l] = (sa_sint_t)((s3 - (sa_uint_t)SUFFIX_GROUP_MARKER) & (sa_uint_t)(~SUFFIX_GROUP_MARKER)); l += ((sa_sint_t)s3 < 0);
    }

    // Tail loop: up to three remaining entries.
    for (j += 3; i < j; i += 1)
    {
        sa_uint_t s = (sa_uint_t)SA[i]; SA[l] = (sa_sint_t)((s - (sa_uint_t)SUFFIX_GROUP_MARKER) & (sa_uint_t)(~SUFFIX_GROUP_MARKER)); l += ((sa_sint_t)s < 0);
    }

    return l;
}

/*
 * Compacts, in place, the negative entries of
 * SA[omp_block_start .. omp_block_start + omp_block_size - 1] towards the start of
 * the block, clearing their top bit (value & SAINT_MAX).
 *
 * Every entry is written to SA[l], but the write position l only advances when the
 * source entry was negative, so non-negative entries are overwritten by the next
 * store. Returns the position one past the last kept entry.
 */
static fast_sint_t libsais_partial_sorting_gather_lms_suffixes_32s_1k(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    // Main loop: four entries per iteration.
    fast_sint_t i, j, l;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4)
    {
        libsais_prefetchr(&SA[i + prefetch_distance]);

        sa_sint_t s0 = SA[i + 0]; SA[l] = s0 & SAINT_MAX; l += (s0 < 0);
        sa_sint_t s1 = SA[i + 1]; SA[l] = s1 & SAINT_MAX; l += (s1 < 0);
        sa_sint_t s2 = SA[i + 2]; SA[l] = s2 & SAINT_MAX; l += (s2 < 0);
        sa_sint_t s3 = SA[i + 3]; SA[l] = s3 & SAINT_MAX; l += (s3 < 0);
    }

    // Tail loop: up to three remaining entries.
    for (j += 3; i < j; i += 1)
    {
        sa_sint_t s = SA[i]; SA[l] = s & SAINT_MAX; l += (s < 0);
    }

    return l;
}

/*
 * Gathers the negative entries of SA[0..n) to the front of SA using the "4k"
 * compaction routine, optionally in parallel.
 *
 * The parallel region is entered only when threads > 1 and n >= 65536. Each thread
 * compacts its own slice in place and records the slice start and the number of
 * kept entries in its thread_state slot. After a barrier, the master thread moves
 * the per-thread runs so that they become contiguous from SA[0]; the run of thread 0
 * is already in place and is not moved.
 */
static void libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        // & (-16) rounds the per-thread stride down to a multiple of 16; the last thread takes the remainder.
        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

        if (omp_num_threads == 1)
        {
            libsais_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                thread_state[omp_thread_num].state.position = omp_block_start;
                thread_state[omp_thread_num].state.count = libsais_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size) - omp_block_start;
            }

            #pragma omp barrier

            #pragma omp master
            {
                // memmove rather than memcpy: source and destination ranges may overlap.
                fast_sint_t t, position = 0;
                for (t = 0; t < omp_num_threads; ++t)
                {
                    if (t > 0 && thread_state[t].state.count > 0)
                    {
                        memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
                    }

                    position += thread_state[t].state.count;
                }
            }
        }
#endif
    }
}

/*
 * Gathers the negative entries of SA[0..n) to the front of SA using the "1k"
 * compaction routine, optionally in parallel. Uses the same slicing as the 4k
 * variant.
 */
static void libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        // & (-16) rounds the per-thread stride down to a multiple of 16; the last thread takes the remainder.
        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : n - omp_block_start; if (omp_num_threads == 1) { libsais_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.position = omp_block_start; thread_state[omp_thread_num].state.count = libsais_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size) - omp_block_start; } #pragma omp barrier #pragma omp master { fast_sint_t t, position = 0; for (t = 0; t < omp_num_threads; ++t) { if (t > 0 && thread_state[t].state.count > 0) { memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); } position += thread_state[t].state.count; } } } #endif } } static void libsais_induce_partial_order_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t flags, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { memset(&buckets[2 * ALPHABET_SIZE], 0, (size_t)2 * ALPHABET_SIZE * sizeof(sa_sint_t)); if (flags & LIBSAIS_FLAGS_GSA) { buckets[4 * ALPHABET_SIZE + BUCKETS_INDEX2(0, 1)] = buckets[4 * ALPHABET_SIZE + BUCKETS_INDEX2(1, 1)] - 1; libsais_flip_suffix_markers_omp(SA, buckets[4 * ALPHABET_SIZE + BUCKETS_INDEX2(0, 1)], threads); } sa_sint_t d = libsais_partial_sorting_scan_left_to_right_8u_omp(T, SA, n, k, buckets, left_suffixes_count, 0, threads, thread_state); libsais_partial_sorting_shift_markers_8u_omp(SA, n, buckets, threads); if (flags & LIBSAIS_FLAGS_GSA) { libsais_partial_gsa_scan_right_to_left_8u_omp(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state); if (T[first_lms_suffix] == 0) { memmove(&SA[1], &SA[0], (size_t)(buckets[BUCKETS_INDEX2(1, 1)] - 1) * sizeof(sa_sint_t)); SA[0] = first_lms_suffix | SAINT_MIN; } buckets[BUCKETS_INDEX2(0, 1)] = 0; } else { 
libsais_partial_sorting_scan_right_to_left_8u_omp(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state); } } static void libsais_induce_partial_order_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_6k_omp(T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state); libsais_partial_sorting_shift_markers_32s_6k_omp(SA, k, buckets, threads); libsais_partial_sorting_shift_buckets_32s_6k(k, buckets); libsais_partial_sorting_scan_right_to_left_32s_6k_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state); } static void libsais_induce_partial_order_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); sa_sint_t d = libsais_partial_sorting_scan_left_to_right_32s_4k_omp(T, SA, n, k, buckets, 0, threads, thread_state); libsais_partial_sorting_shift_markers_32s_4k(SA, n); libsais_partial_sorting_scan_right_to_left_32s_4k_omp(T, SA, n, k, buckets, d, threads, thread_state); libsais_partial_sorting_gather_lms_suffixes_32s_4k_omp(SA, n, threads, thread_state); } static void libsais_induce_partial_order_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { libsais_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, &buckets[1 * (fast_sint_t)k], threads, thread_state); libsais_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, &buckets[0 * (fast_sint_t)k], threads, thread_state); 
libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state);
}

/*
 * Partial-order induction driver for 32-bit text (the "1k" variant).
 *
 * The same `buckets` array is used for both scans, so it is rebuilt before
 * each one: count + initialize-start before the left-to-right scan, then
 * count + initialize-end before the right-to-left scan. The LMS-suffix gather
 * step runs last.
 */
static void libsais_induce_partial_order_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    libsais_count_suffixes_32s(T, n, k, buckets);
    libsais_initialize_buckets_start_32s_1k(k, buckets);
    libsais_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, buckets, threads, thread_state);

    libsais_count_suffixes_32s(T, n, k, buckets);
    libsais_initialize_buckets_end_32s_1k(k, buckets);
    libsais_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, buckets, threads, thread_state);

    libsais_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state);
}

/*
 * Assigns names to the entries SA[omp_block_start .. omp_block_start + omp_block_size).
 *
 * For every entry p, the current `name` (with SAINT_MIN or-ed in) is stored at
 * SAm[(p & SAINT_MAX) >> 1], where SAm = &SA[m]; `name` is then incremented
 * when p is negative (its sign bit is set).
 *
 * Returns the value of `name` after the last entry of the block.
 * The main loop is unrolled by four and prefetches the SA entries and their
 * SAm targets `prefetch_distance` elements ahead; the second loop handles the
 * remaining entries one at a time.
 */
static sa_sint_t libsais_renumber_lms_suffixes_8u(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT SAm = &SA[m];

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
    {
        libsais_prefetchr(&SA[i + 2 * prefetch_distance]);

        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);

        sa_sint_t p0 = SA[i + 0]; SAm[(p0 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p0 < 0;
        sa_sint_t p1 = SA[i + 1]; SAm[(p1 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p1 < 0;
        sa_sint_t p2 = SA[i + 2]; SAm[(p2 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p2 < 0;
        sa_sint_t p3 = SA[i + 3]; SAm[(p3 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p3 < 0;
    }

    /* Restore j to the true block end and finish without prefetching. */
    for (j += prefetch_distance + 3; i < j; i += 1)
    {
        sa_sint_t p = SA[i]; SAm[(p & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p < 0;
    }

    return name;
}

/*
 * Compacts the negative (sign-bit marked) entries of
 * SA[m + omp_block_start .. m + omp_block_start + omp_block_size) towards
 * index `l`, scanning from the end of the block to its start.
 *
 * Every entry is written (with its sign bit cleared) to SA[l], but `l` only
 * moves down when the entry was negative, so non-negative entries are
 * overwritten by the next write. `l` is the exclusive upper end of the output
 * on entry.
 *
 * Returns the index of the lowest output slot that holds a kept entry.
 */
static fast_sint_t libsais_gather_marked_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    /* Work with `l` as the next slot to write rather than one past it. */
    l -= 1;

    fast_sint_t i, j;
    for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j; i -= 4)
    {
        libsais_prefetchr(&SA[i - prefetch_distance]);

        sa_sint_t s0 = SA[i - 0]; SA[l] = s0 & SAINT_MAX; l -= s0 < 0;
        sa_sint_t s1 = SA[i - 1]; SA[l] = s1 & SAINT_MAX; l -= s1 < 0;
        sa_sint_t s2 = SA[i - 2]; SA[l] = s2 & SAINT_MAX; l -= s2 < 0;
        sa_sint_t s3 = SA[i - 3]; SA[l] = s3 & SAINT_MAX; l -= s3 < 0;
    }

    for (j -= 3; i >= j; i -= 1)
    {
        sa_sint_t s = SA[i]; SA[l] = s & SAINT_MAX; l -= s < 0;
    }

    l += 1;

    return l;
}

/*
 * Names the first m entries of SA, optionally splitting the work across
 * OpenMP threads (the parallel region is only enabled for threads > 1 and
 * m >= 65536).
 *
 * Each thread handles a block whose stride is (m / num_threads) rounded down
 * to a multiple of 16; the last thread also takes the remainder.
 *
 * Returns the final name counter.
 */
static sa_sint_t libsais_renumber_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t name = 0;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : m - omp_block_start;

        if (omp_num_threads == 1)
        {
            name = libsais_renumber_lms_suffixes_8u(SA, m, 0, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                /* First pass: per-thread count for this block (helper is defined outside this view). */
                thread_state[omp_thread_num].state.count = libsais_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            {
                /* A thread's starting name is the sum of the counts of all lower-numbered threads. */
                fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }

                /* Only the last thread knows the grand total, so it publishes the result. */
                if (omp_thread_num == omp_num_threads - 1)
                {
                    name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
                }

                libsais_renumber_lms_suffixes_8u(SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
            }
        }
#endif
    }

    return name;
}

/*
 * Gathers the marked entries of SA[m .. m + (n >> 1)) so that they end at
 * index n + fs, optionally in parallel (the parallel region is only enabled
 * for threads > 1 and n >= 131072).
 *
 * Block strides are ((n >> 1) / num_threads) rounded down to a multiple of 16;
 * the last thread takes the remainder.
 */
static void libsais_gather_marked_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start;

        if (omp_num_threads == 1)
        {
            libsais_gather_marked_lms_suffixes(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                if (omp_thread_num < omp_num_threads - 1)
                {
                    /* Non-last threads compact in place, ending at the end of their own block. */
                    thread_state[omp_thread_num].state.position = libsais_gather_marked_lms_suffixes(SA, m, (fast_sint_t)m + omp_block_start + omp_block_size, omp_block_start, omp_block_size);
                    thread_state[omp_thread_num].state.count = (fast_sint_t)m + omp_block_start + omp_block_size - thread_state[omp_thread_num].state.position;
                }
                else
                {
                    /* The last thread writes straight to the final destination ending at n + fs. */
                    thread_state[omp_thread_num].state.position = libsais_gather_marked_lms_suffixes(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size);
                    thread_state[omp_thread_num].state.count = (fast_sint_t)n + (fast_sint_t)fs - thread_state[omp_thread_num].state.position;
                }
            }

            #pragma omp barrier

            #pragma omp master
            {
                /* Walk threads from last to first and stack each run directly below the previous one. */
                fast_sint_t t, position = (fast_sint_t)n + (fast_sint_t)fs;

                for (t = omp_num_threads - 1; t >= 0; --t)
                {
                    position -= thread_state[t].state.count;
                    if (t != omp_num_threads - 1 && thread_state[t].state.count > 0)
                    {
                        memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
                    }
                }
            }
        }
#endif
    }
}

/*
 * Zeroes SA[m .. m + (n >> 1)), names the first m entries, and then either
 * gathers the marked entries (when fewer than m names were produced) or
 * simply clears the sign bit of SA[0 .. m).
 *
 * Returns the name count reported by libsais_renumber_lms_suffixes_8u_omp.
 */
static sa_sint_t libsais_renumber_and_gather_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));

    sa_sint_t name = libsais_renumber_lms_suffixes_8u_omp(SA, m, threads, thread_state);
    if (name < m)
    {
        libsais_gather_marked_lms_suffixes_omp(SA, n, m, fs, threads, thread_state);
    }
    else
    {
        fast_sint_t i; for (i = 0; i < m; i += 1) { SA[i] &= SAINT_MAX; }
    }

    return name;
}

/*
 * Names the entries SA[omp_block_start .. omp_block_start + omp_block_size)
 * and clears their sign bits in place.
 *
 * For every entry p the stored value at SAm[(p & SAINT_MAX) >> 1] is `name`,
 * with SAINT_MIN or-ed in only when both p and the previously processed entry
 * are negative. `name` is incremented after each negative p.
 *
 * Returns the value of `name` after the last entry of the block.
 */
static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t
prefetch_distance = 64;

    sa_sint_t * RESTRICT SAm = &SA[m];

    /* p3 doubles as "previous entry" for the first unrolled step and starts out as 0 (no sign bit). */
    fast_sint_t i, j; sa_sint_t p0, p1, p2, p3 = 0;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
    {
        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);

        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
        libsais_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);

        /* Each step: strip the sign bit in SA[i], use the stripped value / 2 as the SAm index, */
        /* and tag the stored name with SAINT_MIN only if this entry and the previous one were both negative. */
        p0 = SA[i + 0]; SAm[(SA[i + 0] = p0 & SAINT_MAX) >> 1] = name | (p0 & p3 & SAINT_MIN); name += p0 < 0;
        p1 = SA[i + 1]; SAm[(SA[i + 1] = p1 & SAINT_MAX) >> 1] = name | (p1 & p0 & SAINT_MIN); name += p1 < 0;
        p2 = SA[i + 2]; SAm[(SA[i + 2] = p2 & SAINT_MAX) >> 1] = name | (p2 & p1 & SAINT_MIN); name += p2 < 0;
        p3 = SA[i + 3]; SAm[(SA[i + 3] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN); name += p3 < 0;
    }

    for (j += prefetch_distance + 3; i < j; i += 1)
    {
        p2 = p3; p3 = SA[i]; SAm[(SA[i] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN); name += p3 < 0;
    }

    return name;
}

/*
 * Adjusts sign bits in SA[m + omp_block_start .. m + omp_block_start + omp_block_size).
 *
 * Each entry keeps its low bits; its sign bit survives only if the carried
 * "previous" value also has its sign bit set. The carried value is the most
 * recent non-zero original entry (zero entries do not replace it), and it
 * starts as 0 at the beginning of the block.
 */
static void libsais_mark_distinct_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j; sa_sint_t p0, p1, p2, p3 = 0;
    for (i = (fast_sint_t)m + omp_block_start, j = (fast_sint_t)m + omp_block_start + omp_block_size - 3; i < j; i += 4)
    {
        libsais_prefetchw(&SA[i + prefetch_distance]);

        p0 = SA[i + 0]; SA[i + 0] = p0 & (p3 | SAINT_MAX); p0 = (p0 == 0) ? p3 : p0;
        p1 = SA[i + 1]; SA[i + 1] = p1 & (p0 | SAINT_MAX); p1 = (p1 == 0) ? p0 : p1;
        p2 = SA[i + 2]; SA[i + 2] = p2 & (p1 | SAINT_MAX); p2 = (p2 == 0) ? p1 : p2;
        p3 = SA[i + 3]; SA[i + 3] = p3 & (p2 | SAINT_MAX); p3 = (p3 == 0) ? p2 : p3;
    }

    for (j += 3; i < j; i += 1)
    {
        p2 = p3; p3 = SA[i]; SA[i] = p3 & (p2 | SAINT_MAX); p3 = (p3 == 0) ?
p2 : p3;
    }
}

/*
 * Rewrites SAm[omp_block_start .. omp_block_start + omp_block_size), where
 * SAm = &SA[m]: negative entries lose their sign bit, non-negative entries
 * become 0.
 */
static void libsais_clamp_lms_suffixes_length_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT SAm = &SA[m];

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4)
    {
        libsais_prefetchw(&SAm[i + prefetch_distance]);

        SAm[i + 0] = (SAm[i + 0] < 0 ? SAm[i + 0] : 0) & SAINT_MAX;
        SAm[i + 1] = (SAm[i + 1] < 0 ? SAm[i + 1] : 0) & SAINT_MAX;
        SAm[i + 2] = (SAm[i + 2] < 0 ? SAm[i + 2] : 0) & SAINT_MAX;
        SAm[i + 3] = (SAm[i + 3] < 0 ? SAm[i + 3] : 0) & SAINT_MAX;
    }

    for (j += 3; i < j; i += 1)
    {
        SAm[i] = (SAm[i] < 0 ? SAm[i] : 0) & SAINT_MAX;
    }
}

/*
 * Names the first m entries of SA with the "distinct" renumbering helper,
 * optionally splitting the work across OpenMP threads (the parallel region is
 * only enabled for threads > 1 and m >= 65536).
 *
 * The name counter starts at 1 here; the function returns the final counter
 * minus 1.
 */
static sa_sint_t libsais_renumber_distinct_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t name = 0;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Stride is rounded down to a multiple of 16; the last thread absorbs the remainder. */
        fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : m - omp_block_start;

        if (omp_num_threads == 1)
        {
            name = libsais_renumber_distinct_lms_suffixes_32s_4k(SA, m, 1, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                /* First pass: per-thread count for this block (helper is defined outside this view). */
                thread_state[omp_thread_num].state.count = libsais_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            {
                /* Starting name for this thread: 1 plus the counts of all lower-numbered threads. */
                fast_sint_t t, count = 1; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }

                /* The last thread publishes the overall total. */
                if (omp_thread_num == omp_num_threads - 1)
                {
                    name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
                }

                libsais_renumber_distinct_lms_suffixes_32s_4k(SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
            }
        }
#endif
    }

    /* The counter was 1-based, so report one less. */
    return name - 1;
}

/*
 * Runs libsais_mark_distinct_lms_suffixes_32s over the (n >> 1) entries that
 * start at SA[m], optionally split across OpenMP threads (the parallel region
 * is only enabled for threads > 1 and n >= 131072).
 */
static void libsais_mark_distinct_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
        fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start;
#else
        UNUSED(threads);

        fast_sint_t omp_block_start = 0;
        fast_sint_t omp_block_size = (fast_sint_t)n >> 1;
#endif
        libsais_mark_distinct_lms_suffixes_32s(SA, m, omp_block_start, omp_block_size);
    }
}

/*
 * Runs libsais_clamp_lms_suffixes_length_32s over the (n >> 1) entries that
 * start at SA[m], optionally split across OpenMP threads (the parallel region
 * is only enabled for threads > 1 and n >= 131072). Strides are rounded down
 * to a multiple of 16 and the last thread takes the remainder.
 */
static void libsais_clamp_lms_suffixes_length_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
        fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start;
#else
        UNUSED(threads);

        fast_sint_t omp_block_start = 0;
        fast_sint_t omp_block_size = (fast_sint_t)n >> 1;
#endif
        libsais_clamp_lms_suffixes_length_32s(SA, m, omp_block_start, omp_block_size);
    }
}

/*
 * Zeroes SA[m .. m + (n >> 1)), runs the distinct renumbering over the first
 * m entries, and runs the mark-distinct pass only when fewer than m names
 * were produced.
 *
 * Returns the name count reported by the renumbering step.
 */
static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));

    sa_sint_t name = libsais_renumber_distinct_lms_suffixes_32s_4k_omp(SA, m, threads, thread_state);
    if (name < m)
    {
        libsais_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads);
    }

    return name;
}

/*
 * Names the m entries at the front of SA by comparing the corresponding
 * substrings of T directly.
 *
 * Phase 1 (this block): after libsais_gather_lms_suffixes_32s, zero
 * SA[m .. n - m) and, for each position stored in SA[n - m .. n), write
 * (next position - this position + 1 + SAINT_MIN) into SAm[position >> 1];
 * the entry for SA[n - 1] gets 1 + SAINT_MIN.
 * Phase 2: clamp the SAm values.
 * Phase 3: walk SA[0 .. m), comparing each entry with its predecessor.
 *
 * Returns the final name counter minus 1.
 */
static sa_sint_t libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT SAm = &SA[m];

    {
        libsais_gather_lms_suffixes_32s(T, SA, n);

        memset(&SA[m], 0, ((size_t)n - (size_t)m - (size_t)m) * sizeof(sa_sint_t));

        fast_sint_t i, j;
        for (i = (fast_sint_t)n - (fast_sint_t)m, j = (fast_sint_t)n - 1 -
prefetch_distance - 3; i < j; i += 4)
        {
            libsais_prefetchr(&SA[i + 2 * prefetch_distance]);

            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]);
            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]);

            /* Store (distance to the next stored position + 1), tagged by adding SAINT_MIN. */
            SAm[((sa_uint_t)SA[i + 0]) >> 1] = SA[i + 1] - SA[i + 0] + 1 + SAINT_MIN;
            SAm[((sa_uint_t)SA[i + 1]) >> 1] = SA[i + 2] - SA[i + 1] + 1 + SAINT_MIN;
            SAm[((sa_uint_t)SA[i + 2]) >> 1] = SA[i + 3] - SA[i + 2] + 1 + SAINT_MIN;
            SAm[((sa_uint_t)SA[i + 3]) >> 1] = SA[i + 4] - SA[i + 3] + 1 + SAINT_MIN;
        }

        /* Remaining entries up to, but not including, SA[n - 1]. */
        for (j += prefetch_distance + 3; i < j; i += 1)
        {
            SAm[((sa_uint_t)SA[i]) >> 1] = SA[i + 1] - SA[i] + 1 + SAINT_MIN;
        }

        /* The last stored position has no successor; it gets the value 1 (plus the tag). */
        SAm[((sa_uint_t)SA[n - 1]) >> 1] = 1 + SAINT_MIN;
    }

    {
        libsais_clamp_lms_suffixes_length_32s_omp(SA, n, m, threads);
    }

    sa_sint_t name = 1;

    {
        /* p / plen / pdiff describe the previous entry; q / qlen / qdiff the current one. */
        /* A *diff value of SAINT_MIN means "differs from its predecessor", 0 means "equal". */
        fast_sint_t i, j, p = SA[0], plen = SAm[p >> 1]; sa_sint_t pdiff = SAINT_MIN;
        for (i = 1, j = m - prefetch_distance - 1; i < j; i += 2)
        {
            libsais_prefetchr(&SA[i + 2 * prefetch_distance]);

            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); libsais_prefetchr(&T[((sa_uint_t)SA[i + prefetch_distance + 0])]);
            libsais_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); libsais_prefetchr(&T[((sa_uint_t)SA[i + prefetch_distance + 1])]);

            fast_sint_t q = SA[i + 0], qlen = SAm[q >> 1]; sa_sint_t qdiff = SAINT_MIN;
            if (plen == qlen)
            {
                /* Equal lengths: compare symbol by symbol; l reaches qlen only if every symbol matched, */
                /* in which case (l - qlen) is 0 and qdiff becomes 0. */
                fast_sint_t l = 0; do { if (T[p + l] != T[q + l]) { break; } } while (++l < qlen);
                qdiff = (sa_sint_t)(l - qlen) & SAINT_MIN;
            }
            SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0);

            /* Second unrolled step with the roles of p and q swapped. */
            p = SA[i + 1]; plen = SAm[p >> 1]; pdiff = SAINT_MIN;
            if (qlen == plen)
            {
                fast_sint_t l = 0; do { if (T[q + l] != T[p + l]) { break; } } while (++l < plen);
                pdiff = (sa_sint_t)(l - plen) & SAINT_MIN;
            }
            SAm[q >> 1] = name | (qdiff & pdiff); name += (pdiff < 0);
        }

        for (j += prefetch_distance + 1; i < j;
i += 1)
        {
            fast_sint_t q = SA[i], qlen = SAm[q >> 1]; sa_sint_t qdiff = SAINT_MIN;
            if (plen == qlen)
            {
                fast_sint_t l = 0; do { if (T[p + l] != T[q + l]) { break; } } while (++l < plen);
                qdiff = (sa_sint_t)(l - plen) & SAINT_MIN;
            }
            SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0);

            p = q; plen = qlen; pdiff = qdiff;
        }

        /* Flush the last pending entry. */
        SAm[p >> 1] = name | pdiff; name++;
    }

    if (name <= m)
    {
        libsais_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads);
    }

    return name - 1;
}

/*
 * Replaces each entry of SA[omp_block_start .. omp_block_start + omp_block_size)
 * by a lookup through the table that starts at SA[n - m]:
 * SA[i] = SAnm[SA[i]], with SAnm = &SA[n - m].
 *
 * The main loop is unrolled by four and prefetches both the SA entries and
 * their lookup targets `prefetch_distance` elements ahead.
 */
static void libsais_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    const sa_sint_t * RESTRICT SAnm = &SA[n - m];

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
    {
        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);

        libsais_prefetchr(&SAnm[SA[i + prefetch_distance + 0]]);
        libsais_prefetchr(&SAnm[SA[i + prefetch_distance + 1]]);
        libsais_prefetchr(&SAnm[SA[i + prefetch_distance + 2]]);
        libsais_prefetchr(&SAnm[SA[i + prefetch_distance + 3]]);

        SA[i + 0] = SAnm[SA[i + 0]];
        SA[i + 1] = SAnm[SA[i + 1]];
        SA[i + 2] = SAnm[SA[i + 2]];
        SA[i + 3] = SAnm[SA[i + 3]];
    }

    for (j += prefetch_distance + 3; i < j; i += 1)
    {
        SA[i] = SAnm[SA[i]];
    }
}

/*
 * Runs libsais_reconstruct_lms_suffixes over SA[0 .. m), optionally split
 * across OpenMP threads (the parallel region is only enabled for threads > 1
 * and m >= 65536).
 */
static void libsais_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
        fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : m - omp_block_start;
#else
        UNUSED(threads);

        fast_sint_t omp_block_start = 0;
        fast_sint_t omp_block_size = m;
#endif
        libsais_reconstruct_lms_suffixes(SA, n, m, omp_block_start, omp_block_size);
    }
}

/*
 * Spreads the m entries packed at the front of SA out to per-symbol
 * positions, for 8-bit text.
 *
 * Symbols are visited from ALPHABET_SIZE - 2 down to 0. For each symbol c the
 * entry count l is the difference between two neighbouring bucket entries;
 * when l > 0 the gap between bucket_end[c] and the previously placed group is
 * zero-filled and the last l not-yet-placed entries of the front block are
 * moved so that they end at bucket_end[c]. Whatever remains below the lowest
 * placed group is zeroed at the end.
 *
 * With LIBSAIS_FLAGS_GSA set, buckets[7 * ALPHABET_SIZE] is decremented for
 * the duration of the placement and restored afterwards.
 */
static void libsais_place_lms_suffixes_interval_8u(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t flags, sa_sint_t * RESTRICT buckets)
{
    if (flags & LIBSAIS_FLAGS_GSA)
    {
        buckets[7 * ALPHABET_SIZE]--;
    }

    {
        const sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE];

        /* j is the start index of the most recently placed group (n before anything is placed). */
        fast_sint_t c, j = n;
        for (c = ALPHABET_SIZE - 2; c >= 0; --c)
        {
            fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] - (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
            if (l > 0)
            {
                fast_sint_t i = bucket_end[c];
                if (j - i > 0)
                {
                    memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
                }

                /* memmove, not memcpy: source and destination ranges may overlap. */
                memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
            }
        }

        memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
    }

    if (flags & LIBSAIS_FLAGS_GSA)
    {
        buckets[7 * ALPHABET_SIZE]++;
    }
}

/*
 * Same placement scheme as the 8-bit interval variant, for 32-bit text with
 * k symbols: bucket ends are read from buckets[3 * k + c] and symbols are
 * visited from k - 2 down to 0.
 */
static void libsais_place_lms_suffixes_interval_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
{
    const sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k];

    fast_sint_t c, j = n;
    for (c = (fast_sint_t)k - 2; c >= 0; --c)
    {
        fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] - (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
        if (l > 0)
        {
            fast_sint_t i = bucket_end[c];
            if (j - i > 0)
            {
                memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
            }

            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
        }
    }

    memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
}

/*
 * Interval placement for the "2k" bucket layout: the loop variable c is
 * already a BUCKETS_INDEX2 offset, the bucket end is read from buckets[c],
 * and the loop is skipped entirely when k <= 1.
 */
static void libsais_place_lms_suffixes_interval_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
{
    fast_sint_t j = n;

    if (k > 1)
    {
        fast_sint_t c;
        for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1,
0))
        {
            fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(1, 1)] - (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)];
            if (l > 0)
            {
                fast_sint_t i = buckets[c];
                if (j - i > 0)
                {
                    memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
                }

                memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
            }
        }
    }

    memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
}

/*
 * Spreads the m entries at the front of SA out to per-symbol positions using
 * a single bucket array, reading each entry's symbol from T.
 *
 * Entries are consumed from SA[m - 1] down to SA[0]. `c` is the symbol of the
 * group being filled and `l` the next free slot (exclusive) in that group.
 * When an entry's symbol differs from c, the slots between buckets[new c] and
 * l are zeroed and l jumps to buckets[new c]. Each entry is then stored at
 * SA[--l]. Everything below the final l is zeroed.
 */
static void libsais_place_lms_suffixes_interval_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t m, sa_sint_t * RESTRICT buckets)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t c = k - 1; fast_sint_t i, l = buckets[c];
    for (i = (fast_sint_t)m - 1; i >= prefetch_distance + 3; i -= 4)
    {
        libsais_prefetchr(&SA[i - 2 * prefetch_distance]);

        libsais_prefetchr(&T[SA[i - prefetch_distance - 0]]);
        libsais_prefetchr(&T[SA[i - prefetch_distance - 1]]);
        libsais_prefetchr(&T[SA[i - prefetch_distance - 2]]);
        libsais_prefetchr(&T[SA[i - prefetch_distance - 3]]);

        sa_sint_t p0 = SA[i - 0]; if (T[p0] != c) { c = T[p0]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p0;
        sa_sint_t p1 = SA[i - 1]; if (T[p1] != c) { c = T[p1]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p1;
        sa_sint_t p2 = SA[i - 2]; if (T[p2] != c) { c = T[p2]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p2;
        sa_sint_t p3 = SA[i - 3]; if (T[p3] != c) { c = T[p3]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p3;
    }

    /* Remaining entries, without prefetching. */
    for (; i >= 0; i -= 1)
    {
        sa_sint_t p = SA[i]; if (T[p] != c) { c = T[p]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p;
    }

    memset(&SA[0], 0, (size_t)l * sizeof(sa_sint_t));
}

/*
 * Histogram-based placement for the "6k" bucket layout: the per-symbol count
 * is read directly from buckets[BUCKETS_INDEX4(c, 1)] and bucket ends from
 * buckets[5 * k + c].
 */
static void libsais_place_lms_suffixes_histogram_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT
buckets)
{
    const sa_sint_t * RESTRICT bucket_end = &buckets[5 * (fast_sint_t)k];

    /* j is the start index of the most recently placed group (n before anything is placed). */
    fast_sint_t c, j = n;
    for (c = (fast_sint_t)k - 2; c >= 0; --c)
    {
        fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 1)];
        if (l > 0)
        {
            fast_sint_t i = bucket_end[c];
            if (j - i > 0)
            {
                memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
            }

            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
        }
    }

    memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
}

/*
 * Histogram-based placement for the "4k" bucket layout: the per-symbol count
 * is read from buckets[BUCKETS_INDEX2(c, 1)] and bucket ends from
 * buckets[3 * k + c]. Symbols are visited from k - 2 down to 0; groups are
 * moved from the front block to end at their bucket end, gaps are zeroed.
 */
static void libsais_place_lms_suffixes_histogram_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
{
    const sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k];

    fast_sint_t c, j = n;
    for (c = (fast_sint_t)k - 2; c >= 0; --c)
    {
        fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)];
        if (l > 0)
        {
            fast_sint_t i = bucket_end[c];
            if (j - i > 0)
            {
                memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
            }

            memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
        }
    }

    memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
}

/*
 * Histogram-based placement for the "2k" bucket layout: the loop variable c
 * is a BUCKETS_INDEX2 offset, the count is read from
 * buckets[c + BUCKETS_INDEX2(0, 1)] and the bucket end from buckets[c].
 * The loop is skipped when k <= 1.
 */
static void libsais_place_lms_suffixes_histogram_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets)
{
    fast_sint_t j = n;

    if (k > 1)
    {
        fast_sint_t c;
        for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0))
        {
            fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)];
            if (l > 0)
            {
                fast_sint_t i = buckets[c];
                if (j - i > 0)
                {
                    memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t));
                }

                memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t));
            }
        }
    }

    memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t));
}

/*
 * Left-to-right final scan that leaves symbols (rather than positions) in the
 * scanned slots, for 8-bit text.
 *
 * For each entry p in SA[omp_block_start .. omp_block_start + omp_block_size):
 * the sign bit of SA[i] is cleared; if p > 0, p is decremented, SA[i] is
 * overwritten with T[p] | SAINT_MIN, and p is appended to the induction
 * bucket of symbol T[p], with its top bit set when the symbol before p is
 * smaller than T[p].
 */
static void libsais_final_bwt_scan_left_to_right_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start, j =
omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);

        /* Prefetch the two text bytes preceding each upcoming position; non-positive entries fall back to index 2 so the address stays inside T. */
        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais_prefetchr(Ts0 - 1); libsais_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1 > 0 ? s1 : 2]; libsais_prefetchr(Ts1 - 1); libsais_prefetchr(Ts1 - 2);

        /* T[p - (p > 0)] reads the preceding symbol, or T[0] itself when p is 0. */
        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; SA[induction_bucket[T[p0]]++] = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); }
        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; SA[induction_bucket[T[p1]]++] = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); }
    }

    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
    }
}

/*
 * Same scan as libsais_final_bwt_scan_left_to_right_8u, additionally
 * recording auxiliary indexes: whenever the decremented position p satisfies
 * (p & rm) == 0, the bucket cursor value after the insertion is stored in
 * I[p / (rm + 1)].
 */
static void libsais_final_bwt_aux_scan_left_to_right_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);

        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais_prefetchr(Ts0 - 1); libsais_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1 > 0 ?
s1 : 2]; libsais_prefetchr(Ts1 - 1); libsais_prefetchr(Ts1 - 2);

        /* After inserting p into its bucket, sample the advanced bucket cursor into I when p is a multiple of (rm + 1) as tested by (p & rm) == 0. */
        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; SA[induction_bucket[T[p0]]++] = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); if ((p0 & rm) == 0) { I[p0 / (rm + 1)] = induction_bucket[T[p0]]; }}
        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; SA[induction_bucket[T[p1]]++] = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); if ((p1 & rm) == 0) { I[p1 / (rm + 1)] = induction_bucket[T[p1]]; }}
    }

    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]]; } }
    }
}

/*
 * Left-to-right final sorting scan for 8-bit text.
 *
 * For each entry p in SA[omp_block_start .. omp_block_start + omp_block_size):
 * the sign bit of SA[i] is toggled; if p > 0, p is decremented and appended
 * to the induction bucket of symbol T[p], with its top bit set when the
 * symbol before p is smaller than T[p].
 */
static void libsais_final_sorting_scan_left_to_right_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);

        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais_prefetchr(Ts0 - 1); libsais_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1 > 0 ?
s1 : 2]; libsais_prefetchr(Ts1 - 1); libsais_prefetchr(Ts1 - 2);

        /* Toggle the sign bit of the scanned slot, then induce p - 1 into the bucket of its symbol. */
        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); }
        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); }
    }

    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
    }
}

/*
 * Left-to-right final sorting scan for 32-bit text.
 *
 * Performs the same per-entry work as the 8-bit variant, but uses a two-stage
 * prefetch pipeline: text around entries 2 * prefetch_distance ahead is
 * prefetched first, and for entries prefetch_distance ahead the target
 * induction_bucket slot is prefetched as well.
 */
static void libsais_final_sorting_scan_left_to_right_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2)
    {
        libsais_prefetchw(&SA[i + 3 * prefetch_distance]);

        /* Stage 1: text symbol just before each far-ahead position (index 1 is the fallback for non-positive entries). */
        sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 1]; libsais_prefetchr(Ts0 - 1);
        sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 : 1]; libsais_prefetchr(Ts1 - 1);
        /* Stage 2: the symbol read here was prefetched in stage 1; now prefetch its bucket slot and the symbol two before the position. */
        sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetchr(&T[s2] - 2); }
        sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetchr(&T[s3] - 2); }

        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); }
        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); }
    }

    for (j += 2 * prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
    }
}

#if defined(LIBSAIS_OPENMP)

/*
 * Per-thread "prepare" pass of the block-wise left-to-right BWT scan.
 *
 * Updates SA[omp_block_start .. omp_block_start + omp_block_size) exactly as
 * the sequential BWT scan does, but instead of writing induced entries into
 * SA it appends (symbol, index) pairs to `cache` and counts them per symbol
 * in `buckets` (which is zeroed for k symbols first).
 *
 * Returns the number of pairs appended to `cache`.
 */
static fast_sint_t libsais_final_bwt_scan_left_to_right_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    memset(buckets, 0, (size_t)k * sizeof(sa_sint_t));

    fast_sint_t i, j, count = 0;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);

        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais_prefetchr(Ts0 - 1); libsais_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1 > 0 ?
s1 : 2]; libsais_prefetchr(Ts1 - 1); libsais_prefetchr(Ts1 - 2);

        /* Record the symbol in the cache entry and bump that symbol's counter in one expression; the index is stored next and `count` advances. */
        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); }
        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); }
    }

    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
    }

    return count;
}

/*
 * Per-thread "prepare" pass of the block-wise left-to-right final sorting
 * scan.
 *
 * Toggles the sign bit of every entry in the block and, for each positive
 * entry, appends a (symbol, index) pair for position p - 1 to `cache` while
 * counting the pair in `buckets` (zeroed for k symbols first).
 *
 * Returns the number of pairs appended to `cache`.
 */
static fast_sint_t libsais_final_sorting_scan_left_to_right_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    memset(buckets, 0, (size_t)k * sizeof(sa_sint_t));

    fast_sint_t i, j, count = 0;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);

        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais_prefetchr(Ts0 - 1); libsais_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1 > 0 ?
s1 : 2]; libsais_prefetchr(Ts1 - 1); libsais_prefetchr(Ts1 - 2);

        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); }
        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); }
    }

    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
    }

    return count;
}

/*
 * "Place" pass: writes the first `count` cached (symbol, index) pairs into
 * SA, each at the current cursor of its symbol in `buckets`, advancing that
 * cursor by one per write. Pairs are processed in cache order.
 */
static void libsais_final_order_scan_left_to_right_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = 0, j = count - 3; i < j; i += 4)
    {
        libsais_prefetchr(&cache[i + prefetch_distance]);

        SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index;
        SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index;
        SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index;
        SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index;
    }

    for (j += 3; i < j; i += 1)
    {
        SA[buckets[cache[i].symbol]++] = cache[i].index;
    }
}

/*
 * "Place" pass with auxiliary index sampling: like
 * libsais_final_order_scan_left_to_right_8u_block_place, and additionally,
 * whenever (index & rm) == 0, stores the advanced bucket cursor in
 * I[(index & SAINT_MAX) / (rm + 1)].
 */
static void libsais_final_bwt_aux_scan_left_to_right_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = 0, j = count - 3; i < j; i += 4)
    {
        libsais_prefetchr(&cache[i + prefetch_distance]);

        /* The cached index may carry a sign-bit tag, hence the SAINT_MAX mask before dividing. */
        SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index; if ((cache[i + 0].index & rm) == 0) { I[(cache[i + 0].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 0].symbol]; }
        SA[buckets[cache[i + 1].symbol]++] =
cache[i + 1].index; if ((cache[i + 1].index & rm) == 0) { I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 1].symbol]; }
        SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index; if ((cache[i + 2].index & rm) == 0) { I[(cache[i + 2].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 2].symbol]; }
        SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index; if ((cache[i + 3].index & rm) == 0) { I[(cache[i + 3].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 3].symbol]; }
    }

    /* Remaining entries (at most 3): same store-then-sample step, one at a time. */
    for (j += 3; i < j; i += 1)
    {
        SA[buckets[cache[i].symbol]++] = cache[i].index; if ((cache[i].index & rm) == 0) { I[(cache[i].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol]; }
    }
}

/*
 * Gather step of the blocked left-to-right final scan for 32-bit alphabets.
 *
 * For every position i in [omp_block_start, omp_block_start + omp_block_size):
 *   - p = SA[i] is read and SA[i] is rewritten as p ^ SAINT_MIN;
 *   - if p > 0, cache[i].index receives p - 1 with bit (SAINT_BIT - 1) set
 *     when the symbol before position p - 1 is smaller than T[p - 1], and
 *     cache[i].symbol receives T[p - 1];
 *   - otherwise cache[i].symbol is SAINT_MIN (negative) and index is left
 *     untouched.
 *
 * The cache is indexed with the same i as SA. The main loop handles two
 * positions per iteration and prefetches SA, T and cache ahead of use.
 */
static void libsais_final_sorting_scan_left_to_right_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);

        /* Non-positive entries are redirected to &T[2] so the prefetch addresses below stay inside T. */
        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais_prefetchr(Ts0 - 1); libsais_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 : 2]; libsais_prefetchr(Ts1 - 1); libsais_prefetchr(Ts1 - 2);

        libsais_prefetchw(&cache[i + prefetch_distance]);

        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; cache[i + 0].index = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); symbol0 = T[p0]; } cache[i + 0].symbol = symbol0;
        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; cache[i + 1].index = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); symbol1 = T[p1]; } cache[i + 1].symbol = symbol1;
    }

    /* Remaining positions of the block: same gather step, one at a time, no prefetching. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t symbol = SAINT_MIN, p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; cache[i].index = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); symbol = T[p]; } cache[i].symbol = symbol;
    }
}

/*
 * Sort step of the blocked left-to-right final scan for 32-bit alphabets.
 *
 * Walks cache[omp_block_start .. omp_block_end) from left to right. For each
 * entry whose symbol is non-negative:
 *   - the symbol field is replaced by the current cursor
 *     induction_bucket[symbol], which is then post-incremented, so the field
 *     now holds the entry's destination position;
 *   - if that destination is below omp_block_end, the induction step is
 *     applied inside the cache: the entry's index gets SAINT_MIN toggled
 *     and, when the original index np was > 0, cache[destination] is filled
 *     with (np - 1 plus the comparison flag in the top bit, T[np - 1]).
 *     Destinations at or beyond omp_block_end are not expanded here.
 * Entries with a negative symbol are skipped.
 */
static void libsais_final_sorting_scan_left_to_right_32s_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
    {
        libsais_prefetchw(&cache[i + 2 * prefetch_distance]);

        /* Negative (and zero) symbols are mapped to bucket 0 purely to form a valid prefetch address. */
        sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 > 0 ? s0 : 0]; libsais_prefetchw(Is0);
        sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 > 0 ?
s1 : 0]; libsais_prefetchw(Is1);

        sa_sint_t v0 = cache[i + 0].symbol;
        if (v0 >= 0)
        {
            cache[i + 0].symbol = induction_bucket[v0]++;
            if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; cache[i + 0].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | (sa_sint_t)((sa_uint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
        }

        sa_sint_t v1 = cache[i + 1].symbol;
        if (v1 >= 0)
        {
            cache[i + 1].symbol = induction_bucket[v1]++;
            if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; cache[i + 1].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | (sa_sint_t)((sa_uint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
        }
    }

    /* Remaining entries of the block: same step, one at a time, no prefetching. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t v = cache[i].symbol;
        if (v >= 0)
        {
            cache[i].symbol = induction_bucket[v]++;
            if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; cache[i].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | (sa_sint_t)((sa_uint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
        }
    }
}

/*
 * Processes one block [block_start, block_start + block_size) of the
 * left-to-right BWT final scan, optionally in parallel.
 *
 * With OpenMP, a parallel region is opened only when threads > 1, the block
 * holds at least 64 * max(k, 256) entries and omp_get_dynamic() == 0.
 * If the region ends up with a single thread (or OpenMP is not compiled in)
 * the serial libsais_final_bwt_scan_left_to_right_8u is called on the block.
 * Otherwise the work is done in three phases separated by barriers:
 *   1. every thread runs the block_prepare routine on its own slice, filling
 *      its private cache and per-symbol counters;
 *   2. the master thread turns the counters into per-thread starting cursors
 *      and advances induction_bucket accordingly;
 *   3. every thread writes its cached entries into SA using its cursors.
 */
static void libsais_final_bwt_scan_left_to_right_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ?
k : 256) && omp_get_dynamic() == 0)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(k); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Per-thread slice length rounded down to a multiple of 16; the last thread also takes the remainder. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            libsais_final_bwt_scan_left_to_right_8u(T, SA, induction_bucket, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /* Phase 1: each thread scans its slice into its own cache and counters. */
            {
                thread_state[omp_thread_num].state.count = libsais_final_bwt_scan_left_to_right_8u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /*
             * Phase 2 (master only): for threads in order, swap each per-symbol
             * count B for the running cursor A (the thread's first output slot
             * for that symbol) and advance the shared cursor by B.
             */
            #pragma omp master
            {
                fast_sint_t t;
                for (t = 0; t < omp_num_threads; ++t)
                {
                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
                    fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; }
                }
            }

            #pragma omp barrier

            /* Phase 3: each thread places its cached entries using its private cursors. */
            {
                libsais_final_order_scan_left_to_right_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
            }
        }
#endif
    }
}

/*
 * Variant of libsais_final_bwt_scan_left_to_right_8u_block_omp that also
 * maintains the sampled index array I (sampling mask rm).
 *
 * The structure is the same three-phase prepare / merge / place scheme. The
 * prepare phase reuses libsais_final_bwt_scan_left_to_right_8u_block_prepare;
 * the single-thread path and the place phase use the *_bwt_aux_* routines,
 * which receive rm and I.
 */
static void libsais_final_bwt_aux_scan_left_to_right_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ?
k : 256) && omp_get_dynamic() == 0)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(k); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Per-thread slice length rounded down to a multiple of 16; the last thread also takes the remainder. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            libsais_final_bwt_aux_scan_left_to_right_8u(T, SA, rm, I, induction_bucket, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /* Phase 1: each thread scans its slice into its own cache and counters. */
            {
                thread_state[omp_thread_num].state.count = libsais_final_bwt_scan_left_to_right_8u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* Phase 2 (master only): convert per-thread counts into per-thread starting cursors. */
            #pragma omp master
            {
                fast_sint_t t;
                for (t = 0; t < omp_num_threads; ++t)
                {
                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
                    fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; }
                }
            }

            #pragma omp barrier

            /* Phase 3: place cached entries and record samples into I. */
            {
                libsais_final_bwt_aux_scan_left_to_right_8u_block_place(SA, rm, I, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
            }
        }
#endif
    }
}

/*
 * Processes one block of the left-to-right final sorting scan (8-bit text),
 * optionally in parallel.
 *
 * Same three-phase prepare / merge / place scheme as the BWT block routines,
 * using libsais_final_sorting_scan_left_to_right_8u for the single-thread
 * path and the sorting-specific block_prepare for phase 1.
 */
static void libsais_final_sorting_scan_left_to_right_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ?
k : 256) && omp_get_dynamic() == 0)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(k); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Per-thread slice length rounded down to a multiple of 16; the last thread also takes the remainder. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            libsais_final_sorting_scan_left_to_right_8u(T, SA, induction_bucket, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /* Phase 1: each thread scans its slice into its own cache and counters. */
            {
                thread_state[omp_thread_num].state.count = libsais_final_sorting_scan_left_to_right_8u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* Phase 2 (master only): convert per-thread counts into per-thread starting cursors. */
            #pragma omp master
            {
                fast_sint_t t;
                for (t = 0; t < omp_num_threads; ++t)
                {
                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
                    fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; }
                }
            }

            #pragma omp barrier

            /* Phase 3: each thread places its cached entries using its private cursors. */
            {
                libsais_final_order_scan_left_to_right_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
            }
        }
#endif
    }
}

/*
 * Processes one block of the left-to-right final sorting scan for 32-bit
 * alphabets, optionally in parallel.
 *
 * With OpenMP, a parallel region is opened when threads > 1 and the block
 * holds at least 16384 entries. A single-thread region falls back to
 * libsais_final_sorting_scan_left_to_right_32s. Otherwise: all threads gather
 * their slice into the shared cache, the master thread runs the sort step over
 * the whole block, and all threads then call
 * libsais_compact_and_place_cached_suffixes on their slice.
 * The cache pointer is passed as `cache - block_start` so the helpers can
 * index it with absolute SA positions.
 */
static void libsais_final_sorting_scan_left_to_right_32s_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Per-thread slice length rounded down to a multiple of 16; the last thread also takes the remainder. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            libsais_final_sorting_scan_left_to_right_32s(T, SA, buckets, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /* Phase 1: every thread gathers its own slice into the shared cache. */
            {
                libsais_final_sorting_scan_left_to_right_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* Phase 2 (master only): sort step over the whole block, not just one slice. */
            #pragma omp master
            {
                libsais_final_sorting_scan_left_to_right_32s_block_sort(T, buckets, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            /* Phase 3: every thread writes the cached results for its slice. */
            {
                libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }
}

#endif

/*
 * Left-to-right pass of the final BWT scan over SA[0 .. n) for 8-bit text.
 *
 * First stores position n - 1 at the cursor of bucket T[n - 1] (advancing the
 * cursor), with bit (SAINT_BIT - 1) set when T[n - 2] < T[n - 1].
 * When threads == 1 or n < 65536 the whole range is handled by the serial
 * libsais_final_bwt_scan_left_to_right_8u. Otherwise (OpenMP builds only) SA is
 * walked as maximal runs of non-zero entries: zero entries are skipped, runs
 * shorter than 32 are processed inline, longer runs are handed to the block
 * routine. A run is capped at
 * threads * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads) entries.
 */
static void libsais_final_bwt_scan_left_to_right_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | (sa_sint_t)((sa_uint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));

    if (threads == 1 || n < 65536)
    {
        libsais_final_bwt_scan_left_to_right_8u(T, SA, induction_bucket, 0, n);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start;
        for (block_start = 0; block_start < n; )
        {
            if (SA[block_start] == 0)
            {
                block_start++;
            }
            else
            {
                /* Extend the run while entries are non-zero, up to the capped end. */
                fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;}
                fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; }
                fast_sint_t
block_size = block_end - block_start;

                if (block_size < 32)
                {
                    /* Short run: handled inline; the loop itself advances block_start to block_end. */
                    for (; block_start < block_end; block_start += 1)
                    {
                        sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[block_start] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
                    }
                }
                else
                {
                    libsais_final_bwt_scan_left_to_right_8u_block_omp(T, SA, k, induction_bucket, block_start, block_size, threads, thread_state); block_start = block_end;
                }
            }
        }
    }
#else
    UNUSED(k); UNUSED(thread_state);
#endif
}

/*
 * Left-to-right pass of the final BWT scan that also fills the sampled index
 * array I (sampling mask rm).
 *
 * Same control flow as libsais_final_bwt_scan_left_to_right_8u_omp. In
 * addition, whenever an induced position p satisfies (p & rm) == 0, the bucket
 * cursor value right after the store is written to I[p / (rm + 1)]; this is
 * done for the seed position n - 1 and inside the inline short-run loop, and
 * the serial and block routines receive rm and I to do the same.
 */
static void libsais_final_bwt_aux_scan_left_to_right_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t k, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | (sa_sint_t)((sa_uint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));

    if ((((sa_sint_t)n - 1) & rm) == 0) { I[((sa_sint_t)n - 1) / (rm + 1)] = induction_bucket[T[(sa_sint_t)n - 1]]; }

    if (threads == 1 || n < 65536)
    {
        libsais_final_bwt_aux_scan_left_to_right_8u(T, SA, rm, I, induction_bucket, 0, n);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start;
        for (block_start = 0; block_start < n; )
        {
            if (SA[block_start] == 0)
            {
                block_start++;
            }
            else
            {
                /* Extend the run while entries are non-zero, up to the capped end. */
                fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;}
                fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; }
                fast_sint_t block_size = block_end - block_start;

                if (block_size < 32)
                {
                    for (; block_start < block_end; block_start += 1)
                    {
                        sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[block_start] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) <<
(SAINT_BIT - 1)); if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]]; } }
                    }
                }
                else
                {
                    libsais_final_bwt_aux_scan_left_to_right_8u_block_omp(T, SA, k, rm, I, induction_bucket, block_start, block_size, threads, thread_state); block_start = block_end;
                }
            }
        }
    }
#else
    UNUSED(k); UNUSED(thread_state);
#endif
}

/*
 * Left-to-right pass of the final sorting scan over SA[0 .. n) for 8-bit text.
 *
 * First stores position n - 1 at the cursor of bucket T[n - 1] (advancing the
 * cursor), with bit (SAINT_BIT - 1) set when T[n - 2] < T[n - 1].
 * When threads == 1 or n < 65536 the serial scan handles the whole range.
 * Otherwise (OpenMP builds only) SA is walked as maximal runs of non-zero
 * entries, capped at threads * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads):
 * zero entries are skipped, runs shorter than 32 are processed inline and
 * longer runs go to libsais_final_sorting_scan_left_to_right_8u_block_omp.
 *
 * Unlike the BWT variants, a visited entry is rewritten as p ^ SAINT_MIN and
 * no symbol is stored back into it.
 */
static void libsais_final_sorting_scan_left_to_right_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | (sa_sint_t)((sa_uint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));

    if (threads == 1 || n < 65536)
    {
        libsais_final_sorting_scan_left_to_right_8u(T, SA, induction_bucket, 0, n);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start;
        for (block_start = 0; block_start < n; )
        {
            if (SA[block_start] == 0)
            {
                block_start++;
            }
            else
            {
                /* Extend the run while entries are non-zero, up to the capped end. */
                fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;}
                fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; }
                fast_sint_t block_size = block_end - block_start;

                if (block_size < 32)
                {
                    /* Short run: handled inline; the loop itself advances block_start to block_end. */
                    for (; block_start < block_end; block_start += 1)
                    {
                        sa_sint_t p = SA[block_start]; SA[block_start] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
                    }
                }
                else
                {
                    libsais_final_sorting_scan_left_to_right_8u_block_omp(T, SA, k, induction_bucket, block_start, block_size, threads, thread_state); block_start = block_end;
                }
            }
        }
    }
#else
    UNUSED(k); UNUSED(thread_state);
#endif
}

/*
 * Left-to-right pass of the final sorting scan for 32-bit alphabets.
 *
 * Seeds SA with position n - 1, then either runs the serial scan
 * (threads == 1 or n < 65536) or, in OpenMP builds, processes SA in
 * consecutive blocks of threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries using
 * the cache of thread_state[0].
 */
static void libsais_final_sorting_scan_left_to_right_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads,
LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    SA[induction_bucket[T[n - 1]]++] = (n - 1) | (sa_sint_t)((sa_uint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));

    if (threads == 1 || n < 65536)
    {
        libsais_final_sorting_scan_left_to_right_32s(T, SA, induction_bucket, 0, n);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start, block_end;
        for (block_start = 0; block_start < n; block_start = block_end)
        {
            /* Fixed-size blocks; the last one is clipped to n. */
            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; }

            libsais_final_sorting_scan_left_to_right_32s_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, block_start, block_end - block_start, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif
}

/*
 * Serial right-to-left final BWT scan over
 * SA[omp_block_start .. omp_block_start + omp_block_size).
 *
 * For each visited entry p = SA[i]:
 *   - SA[i] is rewritten as p & SAINT_MAX;
 *   - if p > 0: with c1 = T[p - 1] and c0 the symbol before it (T[0] is reused
 *     when p - 1 == 0), SA[i] becomes c1 and the slot obtained by
 *     pre-decrementing induction_bucket[c1] receives p - 1 when c0 <= c1, or
 *     c0 | SAINT_MIN otherwise.
 *
 * Returns the position of the most recently visited entry whose value was 0,
 * or -1 if no such entry was seen. The main loop processes two entries per
 * iteration with prefetching; a second loop finishes the low end of the range.
 */
static sa_sint_t libsais_final_bwt_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j; sa_sint_t index = -1;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais_prefetchw(&SA[i - 2 * prefetch_distance]);

        /* Non-positive entries are redirected to &T[2] so the prefetch addresses below stay inside T. */
        sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais_prefetchr(Ts0 - 1); libsais_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1 > 0 ? s1 : 2]; libsais_prefetchr(Ts1 - 1); libsais_prefetchr(Ts1 - 2);

        sa_sint_t p0 = SA[i - 0]; index = (p0 == 0) ? (sa_sint_t)(i - 0) : index;
        SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; }

        sa_sint_t p1 = SA[i - 1]; index = (p1 == 0) ? (sa_sint_t)(i - 1) : index;
        SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ?
p1 : t; }
    }

    /* Remaining low end of the range: one entry per iteration, no prefetching. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; index = (p == 0) ? (sa_sint_t)i : index;
        SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; }
    }

    return index;
}

/*
 * Serial right-to-left final BWT scan that also fills the sampled index
 * array I (sampling mask rm).
 *
 * Per-entry processing matches libsais_final_bwt_scan_right_to_left_8u, except
 * that no zero-entry position is tracked or returned. After each induced store,
 * if the induced position p satisfies (p & rm) == 0, I[p / (rm + 1)] is set to
 * the (already decremented) bucket cursor plus one.
 */
static void libsais_final_bwt_aux_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais_prefetchw(&SA[i - 2 * prefetch_distance]);

        /* Non-positive entries are redirected to &T[2] so the prefetch addresses below stay inside T. */
        sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais_prefetchr(Ts0 - 1); libsais_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1 > 0 ? s1 : 2]; libsais_prefetchr(Ts1 - 1); libsais_prefetchr(Ts1 - 2);

        sa_sint_t p0 = SA[i - 0];
        SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; if ((p0 & rm) == 0) { I[p0 / (rm + 1)] = induction_bucket[T[p0]] + 1; } }

        sa_sint_t p1 = SA[i - 1];
        SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t; if ((p1 & rm) == 0) { I[p1 / (rm + 1)] = induction_bucket[T[p1]] + 1; } }
    }

    /* Remaining low end of the range: one entry per iteration, no prefetching. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i];
        SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ?
p : t; if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]] + 1; } }
    }
}

/*
 * Serial right-to-left final sorting scan over
 * SA[omp_block_start .. omp_block_start + omp_block_size) for 8-bit text.
 *
 * For each visited entry p = SA[i]: SA[i] is rewritten as p & SAINT_MAX and,
 * if p > 0, position p - 1 is stored in the slot obtained by pre-decrementing
 * induction_bucket[T[p - 1]]. The stored value has bit (SAINT_BIT - 1) set
 * when the symbol before position p - 1 is greater than T[p - 1] (T[0] is
 * compared with itself when p - 1 == 0, so the flag is then clear).
 */
static void libsais_final_sorting_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais_prefetchw(&SA[i - 2 * prefetch_distance]);

        /* Non-positive entries are redirected to &T[2] so the prefetch addresses below stay inside T. */
        sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais_prefetchr(Ts0 - 1); libsais_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1 > 0 ? s1 : 2]; libsais_prefetchr(Ts1 - 1); libsais_prefetchr(Ts1 - 2);

        sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); }
        sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); }
    }

    /* Remaining low end of the range: one entry per iteration, no prefetching. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); }
    }
}

/*
 * Serial right-to-left final scan, GSA variant.
 *
 * Identical to libsais_final_sorting_scan_right_to_left_8u except for the
 * induction condition: position p - 1 is only induced when p > 0 AND
 * T[p - 1] > 0, i.e. nothing is induced for a preceding symbol equal to 0.
 * NOTE(review): symbol 0 is presumably the separator between the
 * concatenated strings -- confirm against the public GSA API.
 */
static void libsais_final_gsa_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais_prefetchw(&SA[i - 2 * prefetch_distance]);

        sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0 > 0 ?
s0 : 2]; libsais_prefetchr(Ts0 - 1); libsais_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1 > 0 ? s1 : 2]; libsais_prefetchr(Ts1 - 1); libsais_prefetchr(Ts1 - 2);

        /* T[p - 1] > 0 is only evaluated when p > 0, so T[-1] is never read. */
        sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0 && T[p0 - 1] > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); }
        sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0 && T[p1 - 1] > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); }
    }

    /* Remaining low end of the range: one entry per iteration, no prefetching. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0 && T[p - 1] > 0) { p--; SA[--induction_bucket[T[p]]] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); }
    }
}

/*
 * Serial right-to-left final sorting scan for 32-bit alphabets.
 *
 * Per-entry processing is the same as in the 8-bit sorting scan: SA[i] is
 * rewritten as p & SAINT_MAX and, if p > 0, position p - 1 (with the
 * comparison flag in bit SAINT_BIT - 1) is stored at the pre-decremented
 * cursor of bucket T[p - 1].
 *
 * Prefetching is done in two stages: entries 2 * prefetch_distance ahead are
 * used to prefetch T, and entries 1 * prefetch_distance ahead are used to
 * prefetch the bucket cursor they will touch (plus T again).
 */
static void libsais_final_sorting_scan_right_to_left_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais_prefetchw(&SA[i - 3 * prefetch_distance]);

        /* Stage 1: non-positive entries are redirected to &T[1] so that Ts - 1 stays inside T. */
        sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 1]; libsais_prefetchr(Ts0 - 1);
        sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 : 1]; libsais_prefetchr(Ts1 - 1);
        /* Stage 2: T[s - 1] is read here (not merely prefetched) to locate the bucket cursor to prefetch. */
        sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais_prefetchw(&induction_bucket[T[s2 - 1]]); libsais_prefetchr(&T[s2] - 2); }
        sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais_prefetchw(&induction_bucket[T[s3 - 1]]); libsais_prefetchr(&T[s3] - 2); }

        sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); }
        sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); }
    }

    /* Remaining low end of the range: one entry per iteration, no prefetching. */
    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); }
    }
}

#if defined(LIBSAIS_OPENMP)

/*
 * Prepare step of the blocked right-to-left final BWT scan.
 *
 * Clears the first k counters of `buckets`, then scans
 * SA[omp_block_start .. omp_block_start + omp_block_size) from right to left.
 * Each entry is rewritten exactly as in the serial BWT scan, but the induced
 * value is not written to SA: it is appended to `cache` as
 * (symbol = c1, index = induced value) and buckets[c1] is incremented.
 *
 * Returns the number of cache entries produced.
 */
static fast_sint_t libsais_final_bwt_scan_right_to_left_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    memset(buckets, 0, (size_t)k * sizeof(sa_sint_t));

    fast_sint_t i, j, count = 0;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais_prefetchw(&SA[i - 2 * prefetch_distance]);

        sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais_prefetchr(Ts0 - 1); libsais_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1 > 0 ?
s1 : 2]; libsais_prefetchr(Ts1 - 1); libsais_prefetchr(Ts1 - 2);

        sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p0 : t; }
        sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p1 : t; }
    }

    /* Remaining low end of the range: one entry per iteration, no prefetching. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p : t; }
    }

    return count;
}

/*
 * Prepare step of the blocked right-to-left final BWT scan with sampling.
 *
 * Works like libsais_final_bwt_scan_right_to_left_8u_block_prepare, but every
 * induced entry occupies TWO consecutive cache slots:
 *   cache[count]     : symbol = c1, index = value destined for SA;
 *   cache[count + 1] : index = the induced text position (symbol unused).
 * `count` therefore advances by 2 per induced entry, and the returned value is
 * the number of cache slots used (twice the number of induced entries).
 */
static fast_sint_t libsais_final_bwt_aux_scan_right_to_left_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    memset(buckets, 0, (size_t)k * sizeof(sa_sint_t));

    fast_sint_t i, j, count = 0;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais_prefetchw(&SA[i - 2 * prefetch_distance]);

        sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais_prefetchr(Ts0 - 1); libsais_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1 > 0 ? s1 : 2]; libsais_prefetchr(Ts1 - 1); libsais_prefetchr(Ts1 - 2);

        sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ?
p0 : t; cache[count + 1].index = p0; count += 2; }
        sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p1 : t; cache[count + 1].index = p1; count += 2; }
    }

    /* Remaining low end of the range: one entry per iteration, no prefetching. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p : t; cache[count + 1].index = p; count += 2; }
    }

    return count;
}

/*
 * Prepare step of the blocked right-to-left final sorting scan (8-bit text).
 *
 * Clears the first k counters of `buckets`, then scans
 * SA[omp_block_start .. omp_block_start + omp_block_size) from right to left.
 * Each entry is rewritten as p & SAINT_MAX; for p > 0 the induced value
 * (p - 1 with the comparison flag in bit SAINT_BIT - 1) is appended to `cache`
 * together with its symbol T[p - 1], and buckets[T[p - 1]] is incremented.
 *
 * Returns the number of cache entries produced.
 */
static fast_sint_t libsais_final_sorting_scan_right_to_left_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    memset(buckets, 0, (size_t)k * sizeof(sa_sint_t));

    fast_sint_t i, j, count = 0;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais_prefetchw(&SA[i - 2 * prefetch_distance]);

        sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais_prefetchr(Ts0 - 1); libsais_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1 > 0 ?
s1 : 2]; libsais_prefetchr(Ts1 - 1); libsais_prefetchr(Ts1 - 2);

        sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); }
        sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); }
    }

    /* Remaining low end of the range: one entry per iteration, no prefetching. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); }
    }

    return count;
}

/*
 * Writes `count` cached entries into SA (right-to-left placement).
 *
 * For every cache entry, in order, the bucket cursor buckets[symbol] is
 * pre-decremented and the entry's index is stored at that slot. The first loop
 * handles four entries per iteration with a read prefetch on the cache; the
 * second loop handles the remaining (at most 3) entries.
 */
static void libsais_final_order_scan_right_to_left_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = 0, j = count - 3; i < j; i += 4)
    {
        libsais_prefetchr(&cache[i + prefetch_distance]);

        SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index;
        SA[--buckets[cache[i + 1].symbol]] = cache[i + 1].index;
        SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index;
        SA[--buckets[cache[i + 3].symbol]] = cache[i + 3].index;
    }

    for (j += 3; i < j; i += 1)
    {
        SA[--buckets[cache[i].symbol]] = cache[i].index;
    }
}

/*
 * GSA variant of the right-to-left placement: identical to
 * libsais_final_order_scan_right_to_left_8u_block_place, except that cache
 * entries whose symbol is not strictly positive are skipped (neither stored
 * nor counted against a bucket cursor).
 */
static void libsais_final_gsa_scan_right_to_left_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = 0, j = count - 3; i < j; i += 4)
    {
        libsais_prefetchr(&cache[i + prefetch_distance]);

        if (cache[i + 0].symbol > 0) { SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index; }
        if (cache[i + 1].symbol > 0) { SA[--buckets[cache[i + 1].symbol]] = cache[i + 1].index; }
        if (cache[i + 2].symbol > 0) { SA[--buckets[cache[i + 2].symbol]] =
cache[i + 2].index; }
        if (cache[i + 3].symbol > 0) { SA[--buckets[cache[i + 3].symbol]] = cache[i + 3].index; }
    }

    /* Remaining entries (at most 3): same guarded store, one at a time. */
    for (j += 3; i < j; i += 1)
    {
        if (cache[i].symbol > 0) { SA[--buckets[cache[i].symbol]] = cache[i].index; }
    }
}

/*
 * Right-to-left placement with sampling into I (sampling mask rm).
 *
 * The cache is consumed in pairs of slots: slot i holds (symbol, value for
 * SA) and slot i + 1 holds a position in its index field. For each pair the
 * bucket cursor buckets[symbol] is pre-decremented and the value stored in SA;
 * then, if (position & rm) == 0, I[position / (rm + 1)] receives the
 * decremented cursor plus one. `count` is a number of slots: the main loop
 * consumes 8 slots (4 pairs) per iteration, the tail loop 2 slots per
 * iteration.
 *
 * NOTE(review): the unrolled loop divides cache[i + 1].index directly, while
 * the tail loop masks it with SAINT_MAX first. The two agree only if the
 * stored position is never negative -- confirm against the routine that fills
 * the cache.
 */
static void libsais_final_bwt_aux_scan_right_to_left_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = 0, j = count - 6; i < j; i += 8)
    {
        libsais_prefetchr(&cache[i + prefetch_distance]);

        SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index; if ((cache[i + 1].index & rm) == 0) { I[cache[i + 1].index / (rm + 1)] = buckets[cache[i + 0].symbol] + 1; }
        SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index; if ((cache[i + 3].index & rm) == 0) { I[cache[i + 3].index / (rm + 1)] = buckets[cache[i + 2].symbol] + 1; }
        SA[--buckets[cache[i + 4].symbol]] = cache[i + 4].index; if ((cache[i + 5].index & rm) == 0) { I[cache[i + 5].index / (rm + 1)] = buckets[cache[i + 4].symbol] + 1; }
        SA[--buckets[cache[i + 6].symbol]] = cache[i + 6].index; if ((cache[i + 7].index & rm) == 0) { I[cache[i + 7].index / (rm + 1)] = buckets[cache[i + 6].symbol] + 1; }
    }

    for (j += 6; i < j; i += 2)
    {
        SA[--buckets[cache[i].symbol]] = cache[i].index; if ((cache[i + 1].index & rm) == 0) { I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol] + 1; }
    }
}

/*
 * Gather step of the blocked right-to-left final scan for 32-bit alphabets.
 *
 * For every position i in [omp_block_start, omp_block_start + omp_block_size),
 * visited in increasing order:
 *   - p = SA[i] is read and SA[i] is rewritten as p & SAINT_MAX;
 *   - if p > 0, cache[i].index receives p - 1 with bit (SAINT_BIT - 1) set
 *     when the symbol before position p - 1 is greater than T[p - 1], and
 *     cache[i].symbol receives T[p - 1];
 *   - otherwise cache[i].symbol is SAINT_MIN (negative).
 * The cache is indexed with the same i as SA.
 */
static void libsais_final_sorting_scan_right_to_left_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais_prefetchw(&SA[i + 2 * prefetch_distance]);

        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ?
s0 : 2]; libsais_prefetchr(Ts0 - 1); libsais_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ? s1 : 2]; libsais_prefetchr(Ts1 - 1); libsais_prefetchr(Ts1 - 2);

        libsais_prefetchw(&cache[i + prefetch_distance]);

        /* A symbol of SAINT_MIN marks a cache slot that produced nothing (SA entry was not > 0). */
        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; cache[i + 0].index = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); symbol0 = T[p0]; } cache[i + 0].symbol = symbol0;
        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; cache[i + 1].index = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); symbol1 = T[p1]; } cache[i + 1].symbol = symbol1;
    }

    /* Remainder: same work, one entry at a time and without prefetching. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t symbol = SAINT_MIN, p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; cache[i].index = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); symbol = T[p]; } cache[i].symbol = symbol;
    }
}

/*
 * Sort phase of the cached right-to-left final scan for 32-bit alphabets.
 *
 * Walks the cache from position omp_block_start + omp_block_size - 1 down to
 * omp_block_start. For every entry whose symbol v is >= 0:
 *   - induction_bucket[v] is pre-decremented and the new value replaces the
 *     entry's symbol (the entry now records a destination position);
 *   - if that destination is >= omp_block_start, the entry's index np is
 *     masked with SAINT_MAX in place and, when np > 0, cache[destination] is
 *     filled with the entry induced from np - 1 (index with the flag in bit
 *     SAINT_BIT - 1, symbol T[np - 1]), exactly as the gather phase would
 *     have produced it.
 * Entries with a negative symbol are skipped.
 *
 * T                - input text.
 * induction_bucket - per-symbol cursors, decremented as entries are assigned.
 * cache            - entries indexed by absolute position.
 * omp_block_start  - first position of the block.
 * omp_block_size   - number of positions in the block.
 */
static void libsais_final_sorting_scan_right_to_left_32s_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais_prefetchw(&cache[i - 2 * prefetch_distance]);

        /* Negative (empty) symbols are clamped to bucket 0 so the prefetch address stays inside the array. */
        sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 > 0 ? s0 : 0]; libsais_prefetchw(Is0);
        sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 > 0 ?
s1 : 0]; libsais_prefetchw(Is1);

        sa_sint_t v0 = cache[i - 0].symbol;
        if (v0 >= 0)
        {
            /* The symbol field is reused to hold the destination position. */
            cache[i - 0].symbol = --induction_bucket[v0];
            if (cache[i - 0].symbol >= omp_block_start)
            {
                /* Destination is inside the cached range: induce its entry directly in the cache. */
                sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; cache[i - 0].index = np & SAINT_MAX;
                if (np > 0) { np--; cache[ni].index = np | (sa_sint_t)((sa_uint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; }
            }
        }

        sa_sint_t v1 = cache[i - 1].symbol;
        if (v1 >= 0)
        {
            cache[i - 1].symbol = --induction_bucket[v1];
            if (cache[i - 1].symbol >= omp_block_start)
            {
                sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; cache[i - 1].index = np & SAINT_MAX;
                if (np > 0) { np--; cache[ni].index = np | (sa_sint_t)((sa_uint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; }
            }
        }
    }

    /* Remainder: same work, one entry at a time and without prefetching. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t v = cache[i].symbol;
        if (v >= 0)
        {
            cache[i].symbol = --induction_bucket[v];
            if (cache[i].symbol >= omp_block_start)
            {
                sa_sint_t ni = cache[i].symbol, np = cache[i].index; cache[i].index = np & SAINT_MAX;
                if (np > 0) { np--; cache[ni].index = np | (sa_sint_t)((sa_uint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; }
            }
        }
    }
}

/*
 * Runs the BWT right-to-left final scan over SA[block_start, block_start + block_size),
 * using an OpenMP team when one is available.
 *
 * The parallel region is only entered with more than one thread when
 * threads > 1, block_size >= 64 * max(k, 256) and omp_get_dynamic() == 0.
 * With a single thread the serial scan is called on the whole block.
 * Otherwise the work is done in three barrier-separated phases:
 *   1. every thread runs the block_prepare step on its slice, filling its own
 *      buckets/cache in thread_state and recording the entry count;
 *   2. the master thread converts the per-thread bucket arrays into per-thread
 *      write cursors and lowers induction_bucket accordingly;
 *   3. every thread places its cached entries into SA.
 *
 * T                - input text.
 * SA               - suffix array being filled.
 * k                - number of bucket entries processed in phase 2.
 * induction_bucket - shared per-symbol cursors.
 * block_start      - first SA position of the block.
 * block_size       - number of SA positions in the block.
 * threads          - requested team size.
 * thread_state     - per-thread scratch state (buckets, cache, count).
 */
static void libsais_final_bwt_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ?
k : 256) && omp_get_dynamic() == 0)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(k); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Per-thread slice: the stride is rounded down to a multiple of 16 and the last thread takes the remainder. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            libsais_final_bwt_scan_right_to_left_8u(T, SA, induction_bucket, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /* Phase 1: each thread fills its private buckets and cache from its slice. */
            {
                thread_state[omp_thread_num].state.count = libsais_final_bwt_scan_right_to_left_8u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /*
             * Phase 2 (master only): visiting threads from last to first, hand the
             * current induction_bucket value to the thread as its cursor and lower
             * the shared bucket by that thread's own bucket value.
             */
            #pragma omp master
            {
                fast_sint_t t;
                for (t = omp_num_threads - 1; t >= 0; --t)
                {
                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
                    fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; }
                }
            }

            #pragma omp barrier

            /* Phase 3: each thread writes its cached entries through its private cursors. */
            {
                libsais_final_order_scan_right_to_left_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
            }
        }
#endif
    }
}

/*
 * Runs the BWT-with-auxiliary-indexes right-to-left final scan over
 * SA[block_start, block_start + block_size), using an OpenMP team when one
 * is available.
 *
 * The parallel region is only entered with more than one thread when
 * threads > 1, block_size >= 64 * max(k, 256) and omp_get_dynamic() == 0.
 * With a single thread the serial scan is called on the whole block.
 * Otherwise: per-thread prepare, barrier, master-only conversion of the
 * per-thread buckets into write cursors, barrier, per-thread placement
 * (which also updates I).
 *
 * T                - input text.
 * SA               - suffix array being filled.
 * k                - number of bucket entries processed by the master step.
 * rm, I            - forwarded unchanged to the serial scan / placement step.
 * induction_bucket - shared per-symbol cursors.
 * block_start      - first SA position of the block.
 * block_size       - number of SA positions in the block.
 * threads          - requested team size.
 * thread_state     - per-thread scratch state (buckets, cache, count).
 */
static void libsais_final_bwt_aux_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ?
k : 256) && omp_get_dynamic() == 0)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(k); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Per-thread slice: the stride is rounded down to a multiple of 16 and the last thread takes the remainder. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            libsais_final_bwt_aux_scan_right_to_left_8u(T, SA, rm, I, induction_bucket, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /* Phase 1: each thread fills its private buckets and cache from its slice. */
            {
                thread_state[omp_thread_num].state.count = libsais_final_bwt_aux_scan_right_to_left_8u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /*
             * Phase 2 (master only): visiting threads from last to first, hand the
             * current induction_bucket value to the thread as its cursor and lower
             * the shared bucket by that thread's own bucket value.
             */
            #pragma omp master
            {
                fast_sint_t t;
                for (t = omp_num_threads - 1; t >= 0; --t)
                {
                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
                    fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; }
                }
            }

            #pragma omp barrier

            /* Phase 3: each thread writes its cached entries (and any sampled indexes in I). */
            {
                libsais_final_bwt_aux_scan_right_to_left_8u_block_place(SA, rm, I, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
            }
        }
#endif
    }
}

/*
 * Runs the suffix-array right-to-left final sorting scan over
 * SA[block_start, block_start + block_size), using an OpenMP team when one
 * is available.
 *
 * The parallel region is only entered with more than one thread when
 * threads > 1, block_size >= 64 * max(k, 256) and omp_get_dynamic() == 0.
 * With a single thread the serial scan is called on the whole block.
 * Otherwise: per-thread prepare, barrier, master-only conversion of the
 * per-thread buckets into write cursors, barrier, per-thread placement.
 *
 * T                - input text.
 * SA               - suffix array being filled.
 * k                - number of bucket entries processed by the master step.
 * induction_bucket - shared per-symbol cursors.
 * block_start      - first SA position of the block.
 * block_size       - number of SA positions in the block.
 * threads          - requested team size.
 * thread_state     - per-thread scratch state (buckets, cache, count).
 */
static void libsais_final_sorting_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ?
k : 256) && omp_get_dynamic() == 0)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(k); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Per-thread slice: the stride is rounded down to a multiple of 16 and the last thread takes the remainder. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            libsais_final_sorting_scan_right_to_left_8u(T, SA, induction_bucket, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /* Phase 1: each thread fills its private buckets and cache from its slice. */
            {
                thread_state[omp_thread_num].state.count = libsais_final_sorting_scan_right_to_left_8u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /*
             * Phase 2 (master only): visiting threads from last to first, hand the
             * current induction_bucket value to the thread as its cursor and lower
             * the shared bucket by that thread's own bucket value.
             */
            #pragma omp master
            {
                fast_sint_t t;
                for (t = omp_num_threads - 1; t >= 0; --t)
                {
                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
                    fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; }
                }
            }

            #pragma omp barrier

            /* Phase 3: each thread writes its cached entries through its private cursors. */
            {
                libsais_final_order_scan_right_to_left_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
            }
        }
#endif
    }
}

/*
 * Runs the generalized-suffix-array (GSA) right-to-left final scan over
 * SA[block_start, block_start + block_size), using an OpenMP team when one
 * is available.
 *
 * The parallel region is only entered with more than one thread when
 * threads > 1, block_size >= 64 * max(k, 256) and omp_get_dynamic() == 0.
 * With a single thread the serial GSA scan is called on the whole block.
 * Otherwise the prepare step of the plain sorting scan is reused, and only
 * the placement step is GSA-specific (it skips entries whose symbol is not
 * greater than zero).
 *
 * T                - input text.
 * SA               - suffix array being filled.
 * k                - number of bucket entries processed by the master step.
 * induction_bucket - shared per-symbol cursors.
 * block_start      - first SA position of the block.
 * block_size       - number of SA positions in the block.
 * threads          - requested team size.
 * thread_state     - per-thread scratch state (buckets, cache, count).
 */
static void libsais_final_gsa_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ?
k : 256) && omp_get_dynamic() == 0)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(k); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Per-thread slice: the stride is rounded down to a multiple of 16 and the last thread takes the remainder. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            libsais_final_gsa_scan_right_to_left_8u(T, SA, induction_bucket, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /* Phase 1: the prepare step of the plain sorting scan is shared with the GSA variant. */
            {
                thread_state[omp_thread_num].state.count = libsais_final_sorting_scan_right_to_left_8u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /*
             * Phase 2 (master only): visiting threads from last to first, hand the
             * current induction_bucket value to the thread as its cursor and lower
             * the shared bucket by that thread's own bucket value.
             */
            #pragma omp master
            {
                fast_sint_t t;
                for (t = omp_num_threads - 1; t >= 0; --t)
                {
                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
                    fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; }
                }
            }

            #pragma omp barrier

            /* Phase 3: GSA placement, which ignores cached entries whose symbol is not > 0. */
            {
                libsais_final_gsa_scan_right_to_left_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
            }
        }
#endif
    }
}

/*
 * Runs the right-to-left final sorting scan for 32-bit alphabets over
 * SA[block_start, block_start + block_size), using an OpenMP team when one
 * is available.
 *
 * The parallel region is only entered with more than one thread when
 * threads > 1 and block_size >= 16384. With a single thread the serial scan
 * is called on the whole block. Otherwise the block is processed in three
 * barrier-separated phases: per-thread gather into the shared cache, a
 * master-only sort pass over the whole block, and per-thread
 * compact-and-place of the cached entries.
 *
 * T           - input text.
 * SA          - suffix array being filled.
 * buckets     - per-symbol cursors used by the scan.
 * cache       - shared cache with room for the block; entry 0 corresponds to
 *               SA position block_start.
 * block_start - first SA position of the block.
 * block_size  - number of SA positions in the block.
 * threads     - requested team size.
 */
static void libsais_final_sorting_scan_right_to_left_32s_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Per-thread slice: the stride is rounded down to a multiple of 16 and the last thread takes the remainder. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            libsais_final_sorting_scan_right_to_left_32s(T, SA, buckets, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /* The cache pointer is rebased by -block_start so the helpers can index it with absolute SA positions. */
            {
                libsais_final_sorting_scan_right_to_left_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* The sort pass covers the whole block and runs on the master thread only. */
            #pragma omp master
            {
                libsais_final_sorting_scan_right_to_left_32s_block_sort(T, buckets, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            {
                libsais_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }
}

#endif

/*
 * Driver for the BWT right-to-left final scan over SA[0, n).
 *
 * With threads == 1 or n < 65536 the serial scan is run over the whole array
 * and its result is returned. Otherwise (OpenMP builds only) SA is walked
 * from position n - 1 downwards:
 *   - a zero entry is skipped and its position is remembered in `index`;
 *   - otherwise a block is grown to the left until a zero entry is met or the
 *     block reaches threads * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads)
 *     entries; blocks shorter than 32 entries are processed inline, longer
 *     ones are handed to the parallel block routine.
 *
 * Returns the value produced by the serial scan, or in the blocked path the
 * position of the last zero entry encountered (-1 if none was encountered).
 *
 * T                - input text.
 * SA               - suffix array being transformed.
 * n                - number of SA entries.
 * k                - forwarded to the parallel block routine.
 * induction_bucket - per-symbol cursors, decremented as entries are placed.
 * threads          - requested number of threads.
 * thread_state     - per-thread scratch state for the parallel block routine.
 */
static sa_sint_t libsais_final_bwt_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t index = -1;

    if (threads == 1 || n < 65536)
    {
        index = libsais_final_bwt_scan_right_to_left_8u(T, SA, induction_bucket, 0, n);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start;
        for (block_start = (fast_sint_t)n - 1; block_start >= 0; )
        {
            if (SA[block_start] == 0)
            {
                index = (sa_sint_t)block_start--;
            }
            else
            {
                /* block_end is exclusive: the block is SA(block_end, block_start]. */
                fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < 0) { block_max_end = -1; }
                fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
                fast_sint_t block_size = block_start - block_end;

                if (block_size < 32)
                {
                    for (; block_start >
block_end; block_start -= 1)
                    {
                        /*
                         * Inline serial step for short blocks: the current slot is replaced by
                         * the symbol c1 at position p - 1, and the induced entry is written at
                         * the bucket cursor for c1, either p - 1 itself (c0 <= c1) or the
                         * preceding symbol c0 with the SAINT_MIN bit set.
                         */
                        sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX;
                        if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[block_start] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; }
                    }
                }
                else
                {
                    libsais_final_bwt_scan_right_to_left_8u_block_omp(T, SA, k, induction_bucket, block_end + 1, block_size, threads, thread_state);
                    block_start = block_end;
                }
            }
        }
    }
#else
    UNUSED(k); UNUSED(thread_state);
#endif

    return index;
}

/*
 * Driver for the BWT-with-auxiliary-indexes right-to-left final scan over SA[0, n).
 *
 * With threads == 1 or n < 65536 the serial scan is run over the whole array.
 * Otherwise (OpenMP builds only) SA is walked from position n - 1 downwards:
 * zero entries are skipped; non-zero runs are cut into blocks of at most
 * threads * ((LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads) / 2) entries.
 * Blocks shorter than 32 entries are processed inline, longer ones are handed
 * to the parallel block routine.
 * NOTE(review): the halved block limit presumably reflects that the aux
 * placement step consumes two cache entries per suffix - confirm against the
 * prepare routine.
 *
 * T                - input text.
 * SA               - suffix array being transformed.
 * n                - number of SA entries.
 * k                - forwarded to the parallel block routine.
 * rm               - mask tested against positions; rm + 1 is the divisor used to index I.
 * I                - output array of sampled values.
 * induction_bucket - per-symbol cursors, decremented as entries are placed.
 * threads          - requested number of threads.
 * thread_state     - per-thread scratch state for the parallel block routine.
 */
static void libsais_final_bwt_aux_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    if (threads == 1 || n < 65536)
    {
        libsais_final_bwt_aux_scan_right_to_left_8u(T, SA, rm, I, induction_bucket, 0, n);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start;
        for (block_start = (fast_sint_t)n - 1; block_start >= 0; )
        {
            if (SA[block_start] == 0)
            {
                block_start--;
            }
            else
            {
                /* block_end is exclusive: the block is SA(block_end, block_start]. */
                fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * ((LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads) / 2); if (block_max_end < 0) { block_max_end = -1; }
                fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
                fast_sint_t block_size = block_start - block_end;

                if (block_size < 32)
                {
                    for (; block_start > block_end; block_start -= 1)
                    {
                        sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX;
                        if (p > 0)
                        {
                            p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[block_start] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ?
p : t;
                            /* Sampled position: record the slot just written (bucket cursor + 1) in I. */
                            if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]] + 1; }
                        }
                    }
                }
                else
                {
                    libsais_final_bwt_aux_scan_right_to_left_8u_block_omp(T, SA, k, rm, I, induction_bucket, block_end + 1, block_size, threads, thread_state);
                    block_start = block_end;
                }
            }
        }
    }
#else
    UNUSED(k); UNUSED(thread_state);
#endif
}

/*
 * Driver for the suffix-array right-to-left final sorting scan over
 * SA[omp_block_start, omp_block_start + omp_block_size).
 *
 * With threads == 1 or omp_block_size < 65536 the serial scan is run over the
 * whole range. Otherwise (OpenMP builds only) the range is walked from its
 * last position downwards: zero entries are skipped; non-zero runs are cut
 * into blocks of at most threads * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads)
 * entries. Blocks shorter than 32 entries are processed inline, longer ones
 * are handed to the parallel block routine.
 *
 * T                - input text.
 * SA               - suffix array being filled.
 * omp_block_start  - first SA position of the range.
 * omp_block_size   - number of SA positions in the range.
 * k                - forwarded to the parallel block routine.
 * induction_bucket - per-symbol cursors, decremented as entries are placed.
 * threads          - requested number of threads.
 * thread_state     - per-thread scratch state for the parallel block routine.
 */
static void libsais_final_sorting_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    if (threads == 1 || omp_block_size < 65536)
    {
        libsais_final_sorting_scan_right_to_left_8u(T, SA, induction_bucket, omp_block_start, omp_block_size);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start;
        for (block_start = omp_block_start + omp_block_size - 1; block_start >= omp_block_start; )
        {
            if (SA[block_start] == 0)
            {
                block_start--;
            }
            else
            {
                /* block_end is exclusive: the block is SA(block_end, block_start]. */
                fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < omp_block_start) { block_max_end = omp_block_start - 1; }
                fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
                fast_sint_t block_size = block_start - block_end;

                if (block_size < 32)
                {
                    for (; block_start > block_end; block_start -= 1)
                    {
                        sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); }
                    }
                }
                else
                {
                    libsais_final_sorting_scan_right_to_left_8u_block_omp(T, SA, k, induction_bucket, block_end + 1, block_size, threads, thread_state);
                    block_start = block_end;
                }
            }
        }
    }
#else
    UNUSED(k); UNUSED(thread_state);
#endif
}

/*
 * Driver for the generalized-suffix-array (GSA) right-to-left final scan over
 * SA[omp_block_start, omp_block_start + omp_block_size).
 *
 * Same blocking scheme as the plain sorting driver; the inline path
 * additionally requires T[p - 1] > 0 before inducing an entry.
 *
 * T                - input text.
 * SA               - suffix array being filled.
 * omp_block_start  - first SA position of the range.
 * omp_block_size   - number of SA positions in the range.
 * k                - forwarded to the parallel block routine.
 * induction_bucket - per-symbol cursors, decremented as entries are placed.
 * threads          - requested number of threads.
 * thread_state     - per-thread scratch state for the parallel block routine.
 */
static void libsais_final_gsa_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size, sa_sint_t k,
sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    if (threads == 1 || omp_block_size < 65536)
    {
        libsais_final_gsa_scan_right_to_left_8u(T, SA, induction_bucket, omp_block_start, omp_block_size);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start;
        for (block_start = omp_block_start + omp_block_size - 1; block_start >= omp_block_start; )
        {
            if (SA[block_start] == 0)
            {
                block_start--;
            }
            else
            {
                /* block_end is exclusive: the block is SA(block_end, block_start]. */
                fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < omp_block_start) { block_max_end = omp_block_start - 1; }
                fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
                fast_sint_t block_size = block_start - block_end;

                if (block_size < 32)
                {
                    for (; block_start > block_end; block_start -= 1)
                    {
                        /* Unlike the plain sorting scan, nothing is induced when the preceding symbol T[p - 1] is 0. */
                        sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0 && T[p - 1] > 0) { p--; SA[--induction_bucket[T[p]]] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); }
                    }
                }
                else
                {
                    libsais_final_gsa_scan_right_to_left_8u_block_omp(T, SA, k, induction_bucket, block_end + 1, block_size, threads, thread_state);
                    block_start = block_end;
                }
            }
        }
    }
#else
    UNUSED(k); UNUSED(thread_state);
#endif
}

/*
 * Driver for the right-to-left final sorting scan for 32-bit alphabets over SA[0, n).
 *
 * With threads == 1 or n < 65536 the serial scan is run over the whole array.
 * Otherwise (OpenMP builds only) SA is processed from the right in
 * consecutive blocks of threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries (the
 * leftmost block may be shorter), each handed to the parallel block routine
 * together with the cache of thread_state[0].
 *
 * T                - input text.
 * SA               - suffix array being filled.
 * n                - number of SA entries.
 * induction_bucket - per-symbol cursors used by the scan.
 * threads          - requested number of threads.
 * thread_state     - per-thread state; only entry 0's cache is used here.
 */
static void libsais_final_sorting_scan_right_to_left_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    if (threads == 1 || n < 65536)
    {
        libsais_final_sorting_scan_right_to_left_32s(T, SA, induction_bucket, 0, n);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start, block_end;
        for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end)
        {
            /* block_end is exclusive: the block is SA(block_end, block_start]. */
            block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; }
libsais_final_sorting_scan_right_to_left_32s_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif
}

/*
 * Zero-fills SA[bucket_start[c], bucket_end[c]) for every c in [0, k) whose
 * range is non-empty.
 *
 * In OpenMP builds the loop over c is distributed with schedule(static, 1)
 * when threads > 1 and n >= 65536.
 *
 * SA           - array to clear.
 * n            - only used for the parallelisation threshold.
 * k            - number of buckets.
 * bucket_start - per-bucket first position to clear.
 * bucket_end   - per-bucket one-past-last position to clear.
 * threads      - requested number of threads.
 */
static void libsais_clear_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT bucket_start, sa_sint_t * RESTRICT bucket_end, sa_sint_t threads)
{
    fast_sint_t c;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && n >= 65536)
#else
    UNUSED(threads); UNUSED(n);
#endif
    for (c = 0; c < k; ++c)
    {
        if (bucket_end[c] > bucket_start[c])
        {
            memset(&SA[bucket_start[c]], 0, ((size_t)bucket_end[c] - (size_t)bucket_start[c]) * sizeof(sa_sint_t));
        }
    }
}

/*
 * Runs the final induction passes for 8-bit input and dispatches on `flags`.
 *
 * In every mode a left-to-right scan using &buckets[6 * ALPHABET_SIZE] is
 * followed by a right-to-left scan using &buckets[7 * ALPHABET_SIZE]; when
 * threads > 1 and n >= 65536 the ranges between those two bucket arrays are
 * cleared in between.
 *   - LIBSAIS_FLAGS_BWT not set: suffix-array scans. With LIBSAIS_FLAGS_GSA,
 *     buckets[6 * ALPHABET_SIZE] is first set to buckets[7 * ALPHABET_SIZE] - 1,
 *     suffix markers are flipped over the first buckets[7 * ALPHABET_SIZE]
 *     entries, and the GSA right-to-left scan covers only positions from
 *     buckets[7 * ALPHABET_SIZE] to n. Returns 0.
 *   - BWT with I != NULL: auxiliary-index scans with rm = r - 1. Returns 0.
 *   - BWT with I == NULL: plain BWT scans. Returns the value of the
 *     right-to-left BWT scan.
 *
 * T            - input text.
 * SA           - suffix array / BWT work array.
 * n            - number of entries.
 * k            - forwarded to the scan routines.
 * flags        - LIBSAIS_FLAGS_* mode bits.
 * r            - sampling parameter for the auxiliary-index mode.
 * I            - auxiliary index output, or NULL.
 * buckets      - bucket table; slices 6 and 7 (of ALPHABET_SIZE entries) are used here.
 * threads      - requested number of threads.
 * thread_state - per-thread scratch state.
 */
static sa_sint_t libsais_induce_final_order_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t flags, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    if ((flags & LIBSAIS_FLAGS_BWT) == 0)
    {
        if (flags & LIBSAIS_FLAGS_GSA) { buckets[6 * ALPHABET_SIZE] = buckets[7 * ALPHABET_SIZE] - 1; }

        libsais_final_sorting_scan_left_to_right_8u_omp(T, SA, n, k, &buckets[6 * ALPHABET_SIZE], threads, thread_state);
        if (threads > 1 && n >= 65536) { libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); }

        if (flags & LIBSAIS_FLAGS_GSA)
        {
            libsais_flip_suffix_markers_omp(SA, buckets[7 * ALPHABET_SIZE], threads);
            libsais_final_gsa_scan_right_to_left_8u_omp(T, SA, buckets[7 * ALPHABET_SIZE], (fast_sint_t)n - buckets[7 * ALPHABET_SIZE], k, &buckets[7 * ALPHABET_SIZE], threads, thread_state);
        }
        else
        {
            libsais_final_sorting_scan_right_to_left_8u_omp(T, SA, 0, n, k, &buckets[7 * ALPHABET_SIZE], threads, thread_state);
        }

        return 0;
    }
    else if (I != NULL)
    {
        libsais_final_bwt_aux_scan_left_to_right_8u_omp(T, SA, n, k, r - 1, I, &buckets[6 * ALPHABET_SIZE], threads, thread_state);
if (threads > 1 && n >= 65536) { libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); }
        libsais_final_bwt_aux_scan_right_to_left_8u_omp(T, SA, n, k, r - 1, I, &buckets[7 * ALPHABET_SIZE], threads, thread_state);
        return 0;
    }
    else
    {
        libsais_final_bwt_scan_left_to_right_8u_omp(T, SA, n, k, &buckets[6 * ALPHABET_SIZE], threads, thread_state);
        if (threads > 1 && n >= 65536) { libsais_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); }
        /* Plain BWT is the only mode whose right-to-left scan result is propagated to the caller. */
        return libsais_final_bwt_scan_right_to_left_8u_omp(T, SA, n, k, &buckets[7 * ALPHABET_SIZE], threads, thread_state);
    }
}

/*
 * Final induction for 32-bit input with the 6k bucket layout: left-to-right
 * scan with the bucket slice starting at 4 * k, then right-to-left scan with
 * the slice starting at 5 * k.
 */
static void libsais_induce_final_order_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[4 * (fast_sint_t)k], threads, thread_state);
    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[5 * (fast_sint_t)k], threads, thread_state);
}

/*
 * Final induction for 32-bit input with the 4k bucket layout: left-to-right
 * scan with the bucket slice starting at 2 * k, then right-to-left scan with
 * the slice starting at 3 * k.
 */
static void libsais_induce_final_order_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[2 * (fast_sint_t)k], threads, thread_state);
    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[3 * (fast_sint_t)k], threads, thread_state);
}

/*
 * Final induction for 32-bit input with the 2k bucket layout: left-to-right
 * scan with the bucket slice starting at 1 * k, then right-to-left scan with
 * the slice starting at 0 * k.
 */
static void libsais_induce_final_order_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[1 * (fast_sint_t)k], threads, thread_state);
    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[0 *
(fast_sint_t)k], threads, thread_state);
}

/*
 * Final induction for 32-bit input with a single k-entry bucket array.
 *
 * Because only one array is available, the suffix counts are recomputed and
 * the array re-initialised before each pass: bucket starts before the
 * left-to-right scan, bucket ends before the right-to-left scan.
 */
static void libsais_induce_final_order_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    libsais_count_suffixes_32s(T, n, k, buckets);
    libsais_initialize_buckets_start_32s_1k(k, buckets);
    libsais_final_sorting_scan_left_to_right_32s_omp(T, SA, n, buckets, threads, thread_state);

    libsais_count_suffixes_32s(T, n, k, buckets);
    libsais_initialize_buckets_end_32s_1k(k, buckets);
    libsais_final_sorting_scan_right_to_left_32s_omp(T, SA, n, buckets, threads, thread_state);
}

/*
 * Renumbers the entries referenced by SA[omp_block_start, omp_block_start + omp_block_size).
 *
 * For each i in the range, p = SA[i] selects the slot SAm[p >> 1], where
 * SAm = &SA[m]. If the slot holds a negative value, the SAINT_MIN bit is set
 * in T[p], f is incremented and the slot becomes i + SAINT_MIN; otherwise the
 * slot is lowered by the current value of f.
 * NOTE(review): judging by the function name, a negative slot marks a
 * "unique" LMS suffix and f counts those - confirm against the code that
 * fills SAm.
 *
 * T               - text; marked in place.
 * SA              - suffix array; the slots starting at SA[m] are rewritten.
 * m               - offset of the slot area inside SA.
 * f               - number of negative slots already accounted for before this range.
 * omp_block_start - first SA position of the range.
 * omp_block_size  - number of SA positions in the range.
 *
 * Returns f plus the number of negative slots found in the range.
 */
static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t f, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT SAm = &SA[m];

    sa_sint_t i, j;
    for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 2 * (sa_sint_t)prefetch_distance - 3; i < j; i += 4)
    {
        /* Three prefetch stages: SA itself, the SAm slots two distances ahead, then T or the slot one distance ahead. */
        libsais_prefetchr(&SA[i + 3 * prefetch_distance]);

        libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 0]) >> 1]);
        libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 1]) >> 1]);
        libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 2]) >> 1]);
        libsais_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 3]) >> 1]);

        sa_uint_t q0 = (sa_uint_t)SA[i + prefetch_distance + 0]; sa_sint_t * RESTRICT Tq0 = &T[q0]; libsais_prefetchw(SAm[q0 >> 1] < 0 ? Tq0 : &SAm[q0 >> 1]);
        sa_uint_t q1 = (sa_uint_t)SA[i + prefetch_distance + 1]; sa_sint_t * RESTRICT Tq1 = &T[q1]; libsais_prefetchw(SAm[q1 >> 1] < 0 ? Tq1 : &SAm[q1 >> 1]);
        sa_uint_t q2 = (sa_uint_t)SA[i + prefetch_distance + 2]; sa_sint_t * RESTRICT Tq2 = &T[q2]; libsais_prefetchw(SAm[q2 >> 1] < 0 ?
Tq2 : &SAm[q2 >> 1]);
        sa_uint_t q3 = (sa_uint_t)SA[i + prefetch_distance + 3]; sa_sint_t * RESTRICT Tq3 = &T[q3]; libsais_prefetchw(SAm[q3 >> 1] < 0 ? Tq3 : &SAm[q3 >> 1]);

        /*
         * For a negative slot, s is set to i + SAINT_MIN + f so that the common
         * "s - f" store below leaves i + SAINT_MIN in the slot.
         */
        sa_uint_t p0 = (sa_uint_t)SA[i + 0]; sa_sint_t s0 = SAm[p0 >> 1]; if (s0 < 0) { T[p0] |= SAINT_MIN; f++; s0 = i + 0 + SAINT_MIN + f; } SAm[p0 >> 1] = s0 - f;
        sa_uint_t p1 = (sa_uint_t)SA[i + 1]; sa_sint_t s1 = SAm[p1 >> 1]; if (s1 < 0) { T[p1] |= SAINT_MIN; f++; s1 = i + 1 + SAINT_MIN + f; } SAm[p1 >> 1] = s1 - f;
        sa_uint_t p2 = (sa_uint_t)SA[i + 2]; sa_sint_t s2 = SAm[p2 >> 1]; if (s2 < 0) { T[p2] |= SAINT_MIN; f++; s2 = i + 2 + SAINT_MIN + f; } SAm[p2 >> 1] = s2 - f;
        sa_uint_t p3 = (sa_uint_t)SA[i + 3]; sa_sint_t s3 = SAm[p3 >> 1]; if (s3 < 0) { T[p3] |= SAINT_MIN; f++; s3 = i + 3 + SAINT_MIN + f; } SAm[p3 >> 1] = s3 - f;
    }

    /* Remainder: same work, one entry at a time and without prefetching. */
    for (j += 2 * (sa_sint_t)prefetch_distance + 3; i < j; i += 1)
    {
        sa_uint_t p = (sa_uint_t)SA[i]; sa_sint_t s = SAm[p >> 1]; if (s < 0) { T[p] |= SAINT_MIN; f++; s = i + SAINT_MIN + f; } SAm[p >> 1] = s - f;
    }

    return f;
}

/*
 * Splits the entries of SA[m + omp_block_start, m + omp_block_start + omp_block_size)
 * into two compacted runs, scanning the range from its last entry to its first.
 *
 * Every entry p is written speculatively to both destinations and only the
 * matching cursor moves:
 *   - SA[l] receives p & SAINT_MAX, and l is decremented only when p is negative;
 *   - SA[r] receives p - 1, and r is decremented only when p is positive.
 * Zero entries therefore advance neither cursor and are dropped.
 *
 * SA              - array holding both the source range and the destinations.
 * m               - offset added to the block bounds to locate the source range.
 * pl              - in: one past the last slot of the "negative" run;
 *                   out: first slot written for that run.
 * pr              - in: one past the last slot of the "positive" run;
 *                   out: first slot written for that run.
 * omp_block_start - start of the block relative to m.
 * omp_block_size  - number of entries in the block.
 */
static void libsais_compact_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t * pl, fast_sint_t * pr, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_uint_t * RESTRICT SAl = (sa_uint_t *)&SA[0];
    sa_uint_t * RESTRICT SAr = (sa_uint_t *)&SA[0];

    fast_sint_t i, j, l = *pl - 1, r = *pr - 1;
    for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j; i -= 4)
    {
        libsais_prefetchr(&SA[i - prefetch_distance]);

        sa_uint_t p0 = (sa_uint_t)SA[i - 0]; SAl[l] = p0 & SAINT_MAX; l -= (sa_sint_t)p0 < 0; SAr[r] = p0 - 1; r -= (sa_sint_t)p0 > 0;
        sa_uint_t p1 = (sa_uint_t)SA[i - 1]; SAl[l] = p1 & SAINT_MAX; l -= (sa_sint_t)p1 < 0; SAr[r] = p1 - 1; r -= (sa_sint_t)p1 > 0;
        sa_uint_t p2 = (sa_uint_t)SA[i - 2]; SAl[l] = p2 & SAINT_MAX; l -= (sa_sint_t)p2 < 0; SAr[r] = p2 - 1; r -= (sa_sint_t)p2 > 0;
        sa_uint_t p3 = (sa_uint_t)SA[i - 3];
SAl[l] = p3 & SAINT_MAX; l -= (sa_sint_t)p3 < 0; SAr[r] = p3 - 1; r -= (sa_sint_t)p3 > 0;
    }

    /* Remainder: at most three entries are left after the unrolled loop. */
    for (j -= 3; i >= j; i -= 1)
    {
        sa_uint_t p = (sa_uint_t)SA[i]; SAl[l] = p & SAINT_MAX; l -= (sa_sint_t)p < 0; SAr[r] = p - 1; r -= (sa_sint_t)p > 0;
    }

    /* Cursors point one below the last written slot; report the first written slot instead. */
    *pl = l + 1; *pr = r + 1;
}

#if defined(LIBSAIS_OPENMP)

/*
 * Counts the positions i in [omp_block_start, omp_block_start + omp_block_size)
 * for which the slot SAm[SA[i] >> 1] is negative, where SAm = &SA[m].
 *
 * Four independent accumulators are used in the unrolled loop and summed at
 * the end. Nothing is modified.
 *
 * Returns the number of negative slots referenced from the range.
 */
static sa_sint_t libsais_count_unique_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT SAm = &SA[m];

    fast_sint_t i, j; sa_sint_t f0 = 0, f1 = 0, f2 = 0, f3 = 0;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
    {
        libsais_prefetchr(&SA[i + 2 * prefetch_distance]);

        libsais_prefetchr(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
        libsais_prefetchr(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
        libsais_prefetchr(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]);
        libsais_prefetchr(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]);

        f0 += SAm[((sa_uint_t)SA[i + 0]) >> 1] < 0;
        f1 += SAm[((sa_uint_t)SA[i + 1]) >> 1] < 0;
        f2 += SAm[((sa_uint_t)SA[i + 2]) >> 1] < 0;
        f3 += SAm[((sa_uint_t)SA[i + 3]) >> 1] < 0;
    }

    /* Remainder: one entry at a time, accumulated into f0. */
    for (j += prefetch_distance + 3; i < j; i += 1)
    {
        f0 += SAm[((sa_uint_t)SA[i]) >> 1] < 0;
    }

    return f0 + f1 + f2 + f3;
}

#endif

/*
 * Renumbers the entries referenced by SA[0, m), optionally in parallel.
 *
 * The parallel region is only entered with more than one thread when
 * threads > 1 and m >= 65536. With a single thread the serial routine is run
 * over the whole range starting from f = 0. Otherwise each thread first
 * counts the negative slots referenced from its slice, and after a barrier
 * runs the serial routine on its slice seeded with the sum of the counts of
 * all lower-numbered threads; the last thread also publishes the grand total.
 *
 * T            - text; marked in place by the serial routine.
 * SA           - suffix array; the slots starting at SA[m] are rewritten.
 * m            - number of entries to process and offset of the slot area.
 * threads      - requested team size.
 * thread_state - per-thread state; only the `count` field is used here.
 *
 * Returns the total number of negative slots found.
 */
static sa_sint_t libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t f = 0;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Per-thread slice: the stride is rounded down to a multiple of 16 and the last thread takes the remainder. */
        fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
        fast_sint_t
omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;

        if (omp_num_threads == 1)
        {
            f = libsais_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, 0, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /* Pass 1: read-only count of negative slots per slice. */
            {
                thread_state[omp_thread_num].state.count = libsais_count_unique_suffixes(SA, m, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* Pass 2: every thread sums the counts of the threads before it and renumbers its slice from that offset. */
            {
                fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }

                /* Only the last thread knows the grand total, so it is the one that publishes f. */
                if (omp_thread_num == omp_num_threads - 1)
                {
                    f = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
                }

                libsais_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
            }
        }
#endif
    }

    return f;
}

/*
 * Compacts the first n / 2 entries that follow SA[m] into two runs and
 * finishes by copying f entries, optionally in parallel.
 *
 * The parallel region is only entered with more than one thread when
 * threads > 1, n >= 131072 and m < fs. With a single thread the serial
 * routine is run over the whole range with the "negative" run ending at m
 * and the "positive" run ending at n + fs. Otherwise each thread compacts
 * its slice into per-slice staging positions and the master thread then
 * moves the per-slice runs so that they end at m and at n + fs respectively.
 *
 * After the (possibly parallel) compaction, f entries are copied from
 * SA[m - f] to SA[n + fs - m].
 *
 * SA           - suffix array, rearranged in place.
 * n, m, fs     - sizes/offsets that define the source range and the destinations.
 * f            - number of entries copied by the final memcpy.
 * threads      - requested team size.
 * thread_state - per-thread state; the `position` and `count` fields are used as cursors.
 */
static void libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072 && m < fs)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Per-thread slice of the n / 2 source entries: stride rounded down to a multiple of 16, last thread takes the remainder. */
        fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start;

        if (omp_num_threads == 1)
        {
            /* Single thread: compact straight into the final destinations. */
            fast_sint_t l = m, r = (fast_sint_t)n + (fast_sint_t)fs;
            libsais_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &l, &r, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /*
             * Each thread compacts into staging areas tied to its own slice:
             * `position` starts at the end of the slice shifted by n / 2, and
             * `count` starts at the end of the slice itself. Both are updated by
             * the callee to the first slot it wrote.
             */
            {
                thread_state[omp_thread_num].state.position = (fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_start + omp_block_size;
                thread_state[omp_thread_num].state.count = (fast_sint_t)m + omp_block_start + omp_block_size;

                libsais_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &thread_state[omp_thread_num].state.position, &thread_state[omp_thread_num].state.count, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            #pragma omp master
            {
                fast_sint_t t, position;

                /* Gather the `position` runs, last thread first, so that together they end at SA[m]. */
                for (position = m, t = omp_num_threads - 1; t >= 0; --t)
                {
                    fast_sint_t omp_block_end = t < omp_num_threads - 1 ? omp_block_stride * (t + 1) : ((fast_sint_t)n >> 1);
                    fast_sint_t count = ((fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_end - thread_state[t].state.position);

                    if (count > 0)
                    {
                        position -= count; memcpy(&SA[position], &SA[thread_state[t].state.position], (size_t)count * sizeof(sa_sint_t));
                    }
                }

                /* Gather the `count` runs, last thread first, so that together they end at SA[n + fs]. */
                for (position = (fast_sint_t)n + (fast_sint_t)fs, t = omp_num_threads - 1; t >= 0; --t)
                {
                    fast_sint_t omp_block_end = t < omp_num_threads - 1 ?
omp_block_stride * (t + 1) : ((fast_sint_t)n >> 1);
                    fast_sint_t count = ((fast_sint_t)m + omp_block_end - thread_state[t].state.count);

                    if (count > 0)
                    {
                        position -= count; memcpy(&SA[position], &SA[thread_state[t].state.count], (size_t)count * sizeof(sa_sint_t));
                    }
                }
            }
        }
#endif
    }

    /* Runs after the parallel region: copy f entries from just below SA[m] to SA[n + fs - m]. */
    memcpy(&SA[(fast_sint_t)n + (fast_sint_t)fs - (fast_sint_t)m], &SA[(fast_sint_t)m - (fast_sint_t)f], (size_t)f * sizeof(sa_sint_t));
}

/*
 * Renumbers and then compacts the LMS suffix entries: runs the renumbering
 * step, feeds its result f into the compaction step, and returns f.
 */
static sa_sint_t libsais_compact_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t f = libsais_renumber_unique_and_nonunique_lms_suffixes_32s_omp(T, SA, m, threads, thread_state);
    libsais_compact_unique_and_nonunique_lms_suffixes_32s_omp(SA, n, m, fs, f, threads, thread_state);

    return f;
}

/*
 * Scatters the positions of marked text symbols into SA.
 *
 * Scans T over [omp_block_start, omp_block_start + omp_block_size). Whenever
 * T[i] is negative, its SAINT_MAX mask is stored back (clearing the mark),
 * i is written to SA[tmp], and the next destination tmp is fetched from the
 * list that starts at SA[n - m - 1 + l]. Each hit also advances i by one
 * extra step, so the position immediately after a marked one is never tested.
 *
 * T               - text; marks are cleared in place.
 * SA              - suffix array; holds both the destination list and the destinations.
 * n, m            - locate the destination list inside SA.
 * l               - number of list entries to skip (marks belonging to earlier ranges).
 * omp_block_start - first text position of the range.
 * omp_block_size  - number of text positions in the range.
 */
static void libsais_merge_unique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];

    sa_sint_t i, j; fast_sint_t tmp = *SAnm++;
    /* The loop bound leaves a margin of 6 because i may advance by more than 4 per iteration. */
    for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 6; i < j; i += 4)
    {
        libsais_prefetchr(&T[i + prefetch_distance]);

        sa_sint_t c0 = T[i + 0]; if (c0 < 0) { T[i + 0] = c0 & SAINT_MAX; SA[tmp] = i + 0; i++; tmp = *SAnm++; }
        sa_sint_t c1 = T[i + 1]; if (c1 < 0) { T[i + 1] = c1 & SAINT_MAX; SA[tmp] = i + 1; i++; tmp = *SAnm++; }
        sa_sint_t c2 = T[i + 2]; if (c2 < 0) { T[i + 2] = c2 & SAINT_MAX; SA[tmp] = i + 2; i++; tmp = *SAnm++; }
        sa_sint_t c3 = T[i + 3]; if (c3 < 0) { T[i + 3] = c3 & SAINT_MAX; SA[tmp] = i + 3; i++; tmp = *SAnm++; }
    }

    for (j += 6; i < j; i += 1)
    {
        sa_sint_t c = T[i]; if (c < 0) { T[i] = c & SAINT_MAX; SA[tmp] = i; i++; tmp = *SAnm++; }
    }
}

/*
 * Fills the zero entries of SA[omp_block_start, omp_block_start + omp_block_size)
 * with consecutive values read from the list that starts at SA[n - m - 1 + l].
 *
 * SA              - suffix array; holds both the value list and the range being filled.
 * n, m            - locate the value list inside SA.
 * l               - number of list entries to skip (zeros belonging to earlier ranges).
 * omp_block_start - first SA position of the range.
 * omp_block_size  - number of SA positions in the range.
 */
static void libsais_merge_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA,
sa_sint_t n, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l]; fast_sint_t i, j; sa_sint_t tmp = *SAnm++; for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4) { libsais_prefetchr(&SA[i + prefetch_distance]); if (SA[i + 0] == 0) { SA[i + 0] = tmp; tmp = *SAnm++; } if (SA[i + 1] == 0) { SA[i + 1] = tmp; tmp = *SAnm++; } if (SA[i + 2] == 0) { SA[i + 2] = tmp; tmp = *SAnm++; } if (SA[i + 3] == 0) { SA[i + 3] = tmp; tmp = *SAnm++; } } for (j += 3; i < j; i += 1) { if (SA[i] == 0) { SA[i] = tmp; tmp = *SAnm++; } } } static void libsais_merge_unique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : n - omp_block_start; if (omp_num_threads == 1) { libsais_merge_unique_lms_suffixes_32s(T, SA, n, m, 0, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.count = libsais_count_negative_marked_suffixes(T, omp_block_start, omp_block_size); } #pragma omp barrier { fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } libsais_merge_unique_lms_suffixes_32s(T, SA, n, m, count, omp_block_start, omp_block_size); } } #endif } } static void libsais_merge_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : m - omp_block_start; if (omp_num_threads == 1) { libsais_merge_nonunique_lms_suffixes_32s(SA, n, m, f, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.count = libsais_count_zero_marked_suffixes(SA, omp_block_start, omp_block_size); } #pragma omp barrier { fast_sint_t t, count = f; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } libsais_merge_nonunique_lms_suffixes_32s(SA, n, m, count, omp_block_start, omp_block_size); } } #endif } } static void libsais_merge_compacted_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { libsais_merge_unique_lms_suffixes_32s_omp(T, SA, n, m, threads, thread_state); libsais_merge_nonunique_lms_suffixes_32s_omp(SA, n, m, f, threads, thread_state); } static void libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { if (f > 0) { memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t)); libsais_count_and_gather_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, local_buckets, threads, thread_state); libsais_reconstruct_lms_suffixes_omp(SA, n, m - f, threads); memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t)); memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t)); libsais_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state); } else { libsais_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n); libsais_reconstruct_lms_suffixes_omp(SA, n, m, threads); } } static void libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, 
sa_sint_t fs, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    /* Same sequence as the 2k variant, but gathering is done without bucket counting. */
    if (f > 0)
    {
        memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t));

        libsais_gather_compacted_lms_suffixes_32s(T, SA, n);
        libsais_reconstruct_lms_suffixes_omp(SA, n, m - f, threads);

        memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t));
        memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t));

        libsais_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state);
    }
    else
    {
        libsais_gather_lms_suffixes_32s(T, SA, n);
        libsais_reconstruct_lms_suffixes_omp(SA, n, m, threads);
    }
}

/*
 * Recursive driver for 32-bit input strings.
 *
 * Chooses one of four strategies depending on how many k-sized bucket arrays
 * fit into the free space fs behind SA or into local_buffer:
 *   6k -> 4k (only if n <= SAINT_MAX / 2) -> 2k -> 1k fallback.
 * In the first three strategies the bucket array lives at the end of
 * SA[0 .. n + fs), aligned when there is room, unless
 * LIBSAIS_LOCAL_BUFFER_SIZE > fs, in which case local_buffer is used.
 * The fallback keeps one k-sized array in SA when fs >= k and otherwise
 * allocates it on the heap.
 *
 * When renumbering yields fewer names than LMS suffixes (names < m), the
 * function calls itself on the reduced string stored inside SA.
 *
 * Returns 0 on success and -2 when a nested call fails or the fallback's
 * bucket allocation fails.
 */
static sa_sint_t libsais_main_32s_recursion(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state, sa_sint_t * RESTRICT local_buffer)
{
    /* Clamp fs so that n + fs cannot exceed SAINT_MAX. */
    fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n);

    if (k > 0 && ((fs / k >= 6) || (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 6)))
    {
        sa_sint_t alignment = (fs - 1024) / k >= 6 ? (sa_sint_t)1024 : (sa_sint_t)16;
        sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 6 ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 6 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 6 * (fast_sint_t)k];
        buckets = (LIBSAIS_LOCAL_BUFFER_SIZE > fs) ? local_buffer : buckets;

        sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_4k_omp(T, SA, n, k, buckets, buckets == local_buffer, threads, thread_state);
        if (m > 1)
        {
            /* Only SA[0 .. n - m) is cleared; SA[n - m] is read right afterwards. */
            memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t));

            sa_sint_t first_lms_suffix = SA[n - m];
            sa_sint_t left_suffixes_count = libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(T, k, buckets, first_lms_suffix);

            libsais_radix_sort_lms_suffixes_32s_6k_omp(T, SA, n, m, &buckets[4 * (fast_sint_t)k], threads, thread_state);
            /* The (n / 8192) < k test is repeated below to pick the matching renumbering and compaction path. */
            if ((n / 8192) < k) { libsais_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * (fast_sint_t)k], threads); }

            if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); }

            libsais_initialize_buckets_for_partial_sorting_32s_6k(T, k, buckets, first_lms_suffix, left_suffixes_count);
            libsais_induce_partial_order_32s_6k_omp(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state);

            sa_sint_t names = (n / 8192) < k
                ? libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state)
                : libsais_renumber_and_gather_lms_suffixes_omp(SA, n, m, fs, threads, thread_state);

            if (names < m)
            {
                sa_sint_t f = (n / 8192) < k ? libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state) : 0;

                /* The reduced string of length m - f is located at SA + n + fs - m + f. */
                if (libsais_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0)
                {
                    return -2;
                }

                libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, buckets == local_buffer, threads, thread_state);
            }
            else
            {
                libsais_count_lms_suffixes_32s_2k(T, n, k, buckets);
            }

            libsais_initialize_buckets_start_and_end_32s_4k(k, buckets);
            libsais_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets);
            libsais_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state);
        }
        else
        {
            SA[0] = SA[n - 1];

            libsais_initialize_buckets_start_and_end_32s_6k(k, buckets);
            libsais_place_lms_suffixes_histogram_32s_6k(SA, n, k, m, buckets);
            libsais_induce_final_order_32s_6k(T, SA, n, k, buckets, threads, thread_state);
        }

        return 0;
    }
    else if (k > 0 && (n <= SAINT_MAX / 2) && ((fs / k >= 4) || (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 4)))
    {
        sa_sint_t alignment = (fs - 1024) / k >= 4 ? (sa_sint_t)1024 : (sa_sint_t)16;
        sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 4 ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 4 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 4 * (fast_sint_t)k];
        buckets = (LIBSAIS_LOCAL_BUFFER_SIZE > fs) ? local_buffer : buckets;

        sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, buckets == local_buffer, threads, thread_state);
        if (m > 1)
        {
            libsais_initialize_buckets_for_radix_and_partial_sorting_32s_4k(T, k, buckets, SA[n - m]);

            libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state);
            libsais_radix_sort_set_markers_32s_4k_omp(SA, k, &buckets[1], threads);

            libsais_place_lms_suffixes_interval_32s_4k(SA, n, k, m - 1, buckets);
            libsais_induce_partial_order_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state);

            sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state);
            if (names < m)
            {
                sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);

                if (libsais_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0)
                {
                    return -2;
                }

                libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, buckets == local_buffer, threads, thread_state);
            }
            else
            {
                libsais_count_lms_suffixes_32s_2k(T, n, k, buckets);
            }
        }
        else
        {
            SA[0] = SA[n - 1];
        }

        libsais_initialize_buckets_start_and_end_32s_4k(k, buckets);
        libsais_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets);
        libsais_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state);

        return 0;
    }
    else if (k > 0 && ((fs / k >= 2) || (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 2)))
    {
        sa_sint_t alignment = (fs - 1024) / k >= 2 ? (sa_sint_t)1024 : (sa_sint_t)16;
        sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 2 ? (sa_sint_t *)libsais_align_up(&SA[n + fs - 2 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 2 * (fast_sint_t)k];
        buckets = (LIBSAIS_LOCAL_BUFFER_SIZE > fs) ? local_buffer : buckets;

        sa_sint_t m = libsais_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, buckets == local_buffer, threads, thread_state);
        if (m > 1)
        {
            libsais_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(T, k, buckets, SA[n - m]);

            libsais_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state);
            libsais_place_lms_suffixes_interval_32s_2k(SA, n, k, m - 1, buckets);

            libsais_initialize_buckets_start_and_end_32s_2k(k, buckets);
            libsais_induce_partial_order_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);

            sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads);
            if (names < m)
            {
                sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);

                if (libsais_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0)
                {
                    return -2;
                }

                libsais_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, buckets == local_buffer, threads, thread_state);
            }
            else
            {
                libsais_count_lms_suffixes_32s_2k(T, n, k, buckets);
            }
        }
        else
        {
            SA[0] = SA[n - 1];
        }

        libsais_initialize_buckets_end_32s_2k(k, buckets);
        libsais_place_lms_suffixes_histogram_32s_2k(SA, n, k, m, buckets);

        libsais_initialize_buckets_start_and_end_32s_2k(k, buckets);
        libsais_induce_final_order_32s_2k(T, SA, n, k, buckets, threads, thread_state);

        return 0;
    }
    else
    {
        /* Fallback: a heap buffer is only allocated when the free space is smaller than k. */
        sa_sint_t * buffer = fs < k ? (sa_sint_t *)libsais_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096) : (sa_sint_t *)NULL;

        sa_sint_t alignment = fs - 1024 >= k ? (sa_sint_t)1024 : (sa_sint_t)16;
        sa_sint_t * RESTRICT buckets = fs - alignment >= k ? (sa_sint_t *)libsais_align_up(&SA[n + fs - k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : fs >= k ? &SA[n + fs - k] : buffer;

        /* buckets can only be NULL here if the heap allocation above failed. */
        if (buckets == NULL) { return -2; }

        memset(SA, 0, (size_t)n * sizeof(sa_sint_t));

        libsais_count_suffixes_32s(T, n, k, buckets);
        libsais_initialize_buckets_end_32s_1k(k, buckets);

        sa_sint_t m = libsais_radix_sort_lms_suffixes_32s_1k(T, SA, n, buckets);
        if (m > 1)
        {
            libsais_induce_partial_order_32s_1k_omp(T, SA, n, k, buckets, threads, thread_state);

            sa_sint_t names = libsais_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads);
            if (names < m)
            {
                /* Release the heap buffer for the duration of the nested call;
                   buckets == NULL doubles as the "must re-allocate" flag below. */
                if (buffer != NULL) { libsais_free_aligned(buffer); buckets = NULL; }

                sa_sint_t f = libsais_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);

                if (libsais_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0)
                {
                    return -2;
                }

                libsais_reconstruct_compacted_lms_suffixes_32s_1k_omp(T, SA, n, m, fs, f, threads, thread_state);

                if (buckets == NULL) { buckets = buffer = (sa_sint_t *)libsais_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096); }
                if (buckets == NULL) { return -2; }
            }

            /* The bucket contents are recomputed before the LMS suffixes are placed. */
            libsais_count_suffixes_32s(T, n, k, buckets);
            libsais_initialize_buckets_end_32s_1k(k, buckets);
            libsais_place_lms_suffixes_interval_32s_1k(T, SA, k, m, buckets);
        }

        libsais_induce_final_order_32s_1k(T, SA, n, k, buckets, threads, thread_state);
        libsais_free_aligned(buffer);

        return 0;
    }
}

/*
 * Entry point into the 32-bit recursion. Provides an on-stack array of
 * 2 * LIBSAIS_LOCAL_BUFFER_SIZE elements and passes a pointer to its middle
 * as local_buffer. Returns whatever libsais_main_32s_recursion() returns.
 */
static sa_sint_t libsais_main_32s_entry(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t local_buffer[2 * LIBSAIS_LOCAL_BUFFER_SIZE];

    return libsais_main_32s_recursion(T, SA, n, k, fs, threads, thread_state, local_buffer + LIBSAIS_LOCAL_BUFFER_SIZE);
}

static sa_sint_t libsais_main_8u(const uint8_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t flags, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    fs = fs <
(SAINT_MAX - n) ? fs : (SAINT_MAX - n); sa_sint_t m = libsais_count_and_gather_lms_suffixes_8u_omp(T, SA, n, buckets, threads, thread_state); sa_sint_t k = libsais_initialize_buckets_start_and_end_8u(buckets, freq); if ((flags & LIBSAIS_FLAGS_GSA) && (buckets[0] != 0 || buckets[2] != 0 || buckets[3] != 1)) { return -1; } if (m > 0) { sa_sint_t first_lms_suffix = SA[n - m]; sa_sint_t left_suffixes_count = libsais_initialize_buckets_for_lms_suffixes_radix_sort_8u(T, buckets, first_lms_suffix); if (threads > 1 && n >= 65536) { memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t)); } libsais_radix_sort_lms_suffixes_8u_omp(T, SA, n, m, flags, buckets, threads, thread_state); if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); } libsais_initialize_buckets_for_partial_sorting_8u(T, buckets, first_lms_suffix, left_suffixes_count); libsais_induce_partial_order_8u_omp(T, SA, n, k, flags, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state); sa_sint_t names = libsais_renumber_and_gather_lms_suffixes_omp(SA, n, m, fs, threads, thread_state); if (names < m) { if (libsais_main_32s_entry(SA + n + fs - m, SA, m, names, fs + n - 2 * m, threads, thread_state) != 0) { return -2; } libsais_gather_lms_suffixes_8u_omp(T, SA, n, threads, thread_state); libsais_reconstruct_lms_suffixes_omp(SA, n, m, threads); } libsais_place_lms_suffixes_interval_8u(SA, n, m, flags, buckets); } else { memset(SA, 0, (size_t)n * sizeof(sa_sint_t)); } return libsais_induce_final_order_8u_omp(T, SA, n, k, flags, r, I, buckets, threads, thread_state); } static sa_sint_t libsais_main(const uint8_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t flags, sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads) { LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? 
libsais_alloc_thread_state(threads) : NULL; sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais_alloc_aligned((size_t)8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); sa_sint_t index = buckets != NULL && (thread_state != NULL || threads == 1) ? libsais_main_8u(T, SA, n, buckets, flags, r, I, fs, freq, threads, thread_state) : -2; libsais_free_aligned(buckets); libsais_free_thread_state(thread_state); return index; } static sa_sint_t libsais_main_int(sa_sint_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads) { LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais_alloc_thread_state(threads) : NULL; sa_sint_t index = thread_state != NULL || threads == 1 ? libsais_main_32s_entry(T, SA, n, k, fs, threads, thread_state) : -2; libsais_free_thread_state(thread_state); return index; } static sa_sint_t libsais_main_ctx(const LIBSAIS_CONTEXT * ctx, const uint8_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t flags, sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq) { return ctx != NULL && (ctx->buckets != NULL && (ctx->thread_state != NULL || ctx->threads == 1)) ? 
libsais_main_8u(T, SA, n, ctx->buckets, flags, r, I, fs, freq, (sa_sint_t)ctx->threads, ctx->thread_state) : -2; } static void libsais_bwt_copy_8u(uint8_t * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8) { libsais_prefetchr(&A[i + prefetch_distance]); U[i + 0] = (uint8_t)A[i + 0]; U[i + 1] = (uint8_t)A[i + 1]; U[i + 2] = (uint8_t)A[i + 2]; U[i + 3] = (uint8_t)A[i + 3]; U[i + 4] = (uint8_t)A[i + 4]; U[i + 5] = (uint8_t)A[i + 5]; U[i + 6] = (uint8_t)A[i + 6]; U[i + 7] = (uint8_t)A[i + 7]; } for (j += 7; i < j; i += 1) { U[i] = (uint8_t)A[i]; } } static void libsais_bwt_copy_8u_omp(uint8_t * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); fast_sint_t omp_block_stride = ((fast_sint_t)n / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : (fast_sint_t)n - omp_block_start; #else UNUSED(threads); fast_sint_t omp_block_start = 0; fast_sint_t omp_block_size = (fast_sint_t)n; #endif libsais_bwt_copy_8u(U + omp_block_start, A + omp_block_start, (sa_sint_t)omp_block_size); } } void * libsais_create_ctx(void) { return (void *)libsais_create_ctx_main(1); } void libsais_free_ctx(void * ctx) { libsais_free_ctx_main((LIBSAIS_CONTEXT *)ctx); } int32_t libsais(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq) { if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); } if (n == 1) { SA[0] = 0; if (freq != NULL) { freq[T[0]]++; } } return 0; } return libsais_main(T, SA, n, LIBSAIS_FLAGS_NONE, 0, NULL, fs, freq, 1); } int32_t libsais_gsa(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq) { if ((T == NULL) || (SA == NULL) || (n < 0) || (n > 0 && T[n - 1] != 0) || (fs < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); } if (n == 1) { SA[0] = 0; if (freq != NULL) { freq[T[0]]++; } } return 0; } return libsais_main(T, SA, n, LIBSAIS_FLAGS_GSA, 0, NULL, fs, freq, 1); } int32_t libsais_int(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs) { if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) { return -1; } else if (n <= 1) { if (n == 1) { SA[0] = 0; } return 0; } return libsais_main_int(T, SA, n, k, fs, 1); } int32_t libsais_ctx(const void * ctx, const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq) { if ((ctx == NULL) || (T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); } if (n == 1) { SA[0] = 0; if (freq != NULL) { freq[T[0]]++; } } return 0; } return libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, SA, n, LIBSAIS_FLAGS_NONE, 0, NULL, fs, freq); } int32_t 
libsais_gsa_ctx(const void * ctx, const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq) { if ((ctx == NULL) || (T == NULL) || (SA == NULL) || (n < 0) || (n > 0 && T[n - 1] != 0) || (fs < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); } if (n == 1) { SA[0] = 0; if (freq != NULL) { freq[T[0]]++; } } return 0; } return libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, SA, n, LIBSAIS_FLAGS_GSA, 0, NULL, fs, freq); } int32_t libsais_bwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); } if (n == 1) { U[0] = T[0]; if (freq != NULL) { freq[T[0]]++; } } return n; } sa_sint_t index = libsais_main(T, A, n, LIBSAIS_FLAGS_BWT, 0, NULL, fs, freq, 1); if (index >= 0) { index++; U[0] = T[n - 1]; libsais_bwt_copy_8u_omp(U + 1, A, index - 1, 1); libsais_bwt_copy_8u_omp(U + index, A + index, n - index, 1); } return index; } int32_t libsais_bwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); } if (n == 1) { U[0] = T[0]; if (freq != NULL) { freq[T[0]]++; } } I[0] = n; return 0; } sa_sint_t index = libsais_main(T, A, n, LIBSAIS_FLAGS_BWT, r, I, fs, freq, 1); if (index == 0) { U[0] = T[n - 1]; libsais_bwt_copy_8u_omp(U + 1, A, I[0] - 1, 1); libsais_bwt_copy_8u_omp(U + I[0], A + I[0], n - I[0], 1); } return index; } int32_t libsais_bwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq) { if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || 
(fs < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); } if (n == 1) { U[0] = T[0]; if (freq != NULL) { freq[T[0]]++; } } return n; } sa_sint_t index = libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, LIBSAIS_FLAGS_BWT, 0, NULL, fs, freq); if (index >= 0) { index++; U[0] = T[n - 1]; libsais_bwt_copy_8u_omp(U + 1, A, index - 1, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads); libsais_bwt_copy_8u_omp(U + index, A + index, n - index, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads); } return index; } int32_t libsais_bwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I) { if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); } if (n == 1) { U[0] = T[0]; if (freq != NULL) { freq[T[0]]++; } } I[0] = n; return 0; } sa_sint_t index = libsais_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, LIBSAIS_FLAGS_BWT, r, I, fs, freq); if (index == 0) { U[0] = T[n - 1]; libsais_bwt_copy_8u_omp(U + 1, A, I[0] - 1, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads); libsais_bwt_copy_8u_omp(U + I[0], A + I[0], n - I[0], (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads); } return index; } #if defined(LIBSAIS_OPENMP) void * libsais_create_ctx_omp(int32_t threads) { if (threads < 0) { return NULL; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? 
threads : 1; return (void *)libsais_create_ctx_main(threads); } int32_t libsais_omp(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads) { if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0) || (threads < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); } if (n == 1) { SA[0] = 0; if (freq != NULL) { freq[T[0]]++; } } return 0; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? threads : 1; return libsais_main(T, SA, n, LIBSAIS_FLAGS_NONE, 0, NULL, fs, freq, threads); } int32_t libsais_gsa_omp(const uint8_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads) { if ((T == NULL) || (SA == NULL) || (n < 0) || (n > 0 && T[n - 1] != 0) || (fs < 0) || (threads < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); } if (n == 1) { SA[0] = 0; if (freq != NULL) { freq[T[0]]++; } } return 0; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? threads : 1; return libsais_main(T, SA, n, LIBSAIS_FLAGS_GSA, 0, NULL, fs, freq, threads); } int32_t libsais_int_omp(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs, int32_t threads) { if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0) || (threads < 0)) { return -1; } else if (n <= 1) { if (n == 1) { SA[0] = 0; } return 0; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? 
threads : 1; return libsais_main_int(T, SA, n, k, fs, threads); } int32_t libsais_bwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t threads) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (threads < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); } if (n == 1) { U[0] = T[0]; if (freq != NULL) { freq[T[0]]++; } } return n; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? threads : 1; sa_sint_t index = libsais_main(T, A, n, LIBSAIS_FLAGS_BWT, 0, NULL, fs, freq, threads); if (index >= 0) { index++; U[0] = T[n - 1]; libsais_bwt_copy_8u_omp(U + 1, A, index - 1, threads); libsais_bwt_copy_8u_omp(U + index, A + index, n - index, threads); } return index; } int32_t libsais_bwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I, int32_t threads) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL) || (threads < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); } if (n == 1) { U[0] = T[0]; if (freq != NULL) { freq[T[0]]++; } } I[0] = n; return 0; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? 
threads : 1; sa_sint_t index = libsais_main(T, A, n, LIBSAIS_FLAGS_BWT, r, I, fs, freq, threads); if (index == 0) { U[0] = T[n - 1]; libsais_bwt_copy_8u_omp(U + 1, A, I[0] - 1, threads); libsais_bwt_copy_8u_omp(U + I[0], A + I[0], n - I[0], threads); } return index; } #endif static LIBSAIS_UNBWT_CONTEXT * libsais_unbwt_create_ctx_main(sa_sint_t threads) { LIBSAIS_UNBWT_CONTEXT * RESTRICT ctx = (LIBSAIS_UNBWT_CONTEXT *)libsais_alloc_aligned(sizeof(LIBSAIS_UNBWT_CONTEXT), 64); sa_uint_t * RESTRICT bucket2 = (sa_uint_t *)libsais_alloc_aligned(ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096); uint16_t * RESTRICT fastbits = (uint16_t *)libsais_alloc_aligned((1 + (1 << UNBWT_FASTBITS)) * sizeof(uint16_t), 4096); sa_uint_t * RESTRICT buckets = threads > 1 ? (sa_uint_t *)libsais_alloc_aligned((size_t)threads * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) * sizeof(sa_uint_t), 4096) : NULL; if (ctx != NULL && bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1)) { ctx->bucket2 = bucket2; ctx->fastbits = fastbits; ctx->buckets = buckets; ctx->threads = threads; return ctx; } libsais_free_aligned(buckets); libsais_free_aligned(fastbits); libsais_free_aligned(bucket2); libsais_free_aligned(ctx); return NULL; } static void libsais_unbwt_free_ctx_main(LIBSAIS_UNBWT_CONTEXT * ctx) { if (ctx != NULL) { libsais_free_aligned(ctx->buckets); libsais_free_aligned(ctx->fastbits); libsais_free_aligned(ctx->bucket2); libsais_free_aligned(ctx); } } static void libsais_unbwt_compute_histogram(const uint8_t * RESTRICT T, fast_sint_t n, sa_uint_t * RESTRICT count) { const fast_sint_t prefetch_distance = 256; const uint8_t * RESTRICT T_p = T; if (n >= 1024) { sa_uint_t copy[4 * (ALPHABET_SIZE + 16)]; memset(copy, 0, (size_t)4 * (ALPHABET_SIZE + 16) * sizeof(sa_uint_t)); sa_uint_t * RESTRICT copy0 = copy + 0 * (ALPHABET_SIZE + 16); sa_uint_t * RESTRICT copy1 = copy + 1 * (ALPHABET_SIZE + 16); sa_uint_t * RESTRICT copy2 = copy + 2 * (ALPHABET_SIZE + 16); sa_uint_t 
* RESTRICT copy3 = copy + 3 * (ALPHABET_SIZE + 16);

        /* Count single bytes until T_p reaches the next 64-byte boundary. */
        for (; T_p < (uint8_t * )((ptrdiff_t)(T + 63) & (-64)); T_p += 1) { copy0[T_p[0]]++; }

        /* x and y always hold two 32-bit words loaded ahead of their use;
           z and w are loaded inside the loop in the same alternating fashion. */
        fast_uint_t x = ((const uint32_t *)(const void *)T_p)[0], y = ((const uint32_t *)(const void *)T_p)[1];

        /* 64 bytes per iteration. Each word is split into four bytes and each
           byte position is counted in its own table (copy0..copy3). */
        for (; T_p < (uint8_t * )((ptrdiff_t)(T + n - 8) & (-64)); T_p += 64)
        {
            libsais_prefetchr(&T_p[prefetch_distance]);

            fast_uint_t z = ((const uint32_t *)(const void *)T_p)[2], w = ((const uint32_t *)(const void *)T_p)[3];
            copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++;
            copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++;

            x = ((const uint32_t *)(const void *)T_p)[4]; y = ((const uint32_t *)(const void *)T_p)[5];
            copy0[(uint8_t)z]++; z >>= 8; copy1[(uint8_t)z]++; z >>= 8; copy2[(uint8_t)z]++; z >>= 8; copy3[z]++;
            copy0[(uint8_t)w]++; w >>= 8; copy1[(uint8_t)w]++; w >>= 8; copy2[(uint8_t)w]++; w >>= 8; copy3[w]++;

            z = ((const uint32_t *)(const void *)T_p)[6]; w = ((const uint32_t *)(const void *)T_p)[7];
            copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++;
            copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++;

            x = ((const uint32_t *)(const void *)T_p)[8]; y = ((const uint32_t *)(const void *)T_p)[9];
            copy0[(uint8_t)z]++; z >>= 8; copy1[(uint8_t)z]++; z >>= 8; copy2[(uint8_t)z]++; z >>= 8; copy3[z]++;
            copy0[(uint8_t)w]++; w >>= 8; copy1[(uint8_t)w]++; w >>= 8; copy2[(uint8_t)w]++; w >>= 8; copy3[w]++;

            z = ((const uint32_t *)(const void *)T_p)[10]; w = ((const uint32_t *)(const void *)T_p)[11];
            copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++;
            copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++;

            x = ((const uint32_t *)(const void *)T_p)[12]; y = ((const uint32_t *)(const void *)T_p)[13];
            copy0[(uint8_t)z]++; z >>= 8; copy1[(uint8_t)z]++; z >>= 8; copy2[(uint8_t)z]++; z >>= 8; copy3[z]++;
            copy0[(uint8_t)w]++; w >>= 8; copy1[(uint8_t)w]++; w >>= 8; copy2[(uint8_t)w]++; w >>= 8; copy3[w]++;

            z = ((const uint32_t *)(const void *)T_p)[14]; w = ((const uint32_t *)(const void *)T_p)[15];
            copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++;
            copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++;

            /* Words 16 and 17 are the first two words of the next 64-byte chunk. */
            x = ((const uint32_t *)(const void *)T_p)[16]; y = ((const uint32_t *)(const void *)T_p)[17];
            copy0[(uint8_t)z]++; z >>= 8; copy1[(uint8_t)z]++; z >>= 8; copy2[(uint8_t)z]++; z >>= 8; copy3[z]++;
            copy0[(uint8_t)w]++; w >>= 8; copy1[(uint8_t)w]++; w >>= 8; copy2[(uint8_t)w]++; w >>= 8; copy3[w]++;
        }

        /* Drain the two words that were loaded ahead, then step past their 8 bytes. */
        copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++;
        copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++;

        T_p += 8;

        /* Fold the four partial tables into the caller's counters. */
        fast_uint_t i; for (i = 0; i < ALPHABET_SIZE; i++) { count[i] += copy0[i] + copy1[i] + copy2[i] + copy3[i]; }
    }

    /* Remaining bytes (or the whole input when n < 1024) are counted one by one. */
    for (; T_p < T + n; T_p += 1) { count[T_p[0]]++; }
}

/*
 * Transposes bucket2 in place, treating it as a square matrix with a row
 * stride of 256 (index = (row << 8) + column).
 *
 * The work is done in 16 x 16 tiles: for every diagonal tile the elements
 * above and below the diagonal are swapped, and every tile to the right of
 * the diagonal is swapped with its mirror tile below it.
 */
static void libsais_unbwt_transpose_bucket2(sa_uint_t * RESTRICT bucket2)
{
    fast_uint_t x, y, c, d;

    for (x = 0; x != ALPHABET_SIZE; x += 16)
    {
        /* Diagonal tile starting at (x, x). */
        for (c = x; c != x + 16; ++c)
        {
            for (d = c + 1; d != x + 16; ++d)
            {
                sa_uint_t tmp = bucket2[(d << 8) + c]; bucket2[(d << 8) + c] = bucket2[(c << 8) + d]; bucket2[(c << 8) + d] = tmp;
            }
        }

        /* Off-diagonal tiles: 16 swaps per value of c, written out by hand. */
        for (y = x + 16; y != ALPHABET_SIZE; y += 16)
        {
            for (c = x; c != x + 16; ++c)
            {
                sa_uint_t * bucket2_yc = &bucket2[(y << 8) + c];
                sa_uint_t * bucket2_cy = &bucket2[(c << 8) + y];

                sa_uint_t tmp00 = bucket2_yc[ 0 * 256]; bucket2_yc[ 0 * 256] = bucket2_cy[ 0]; bucket2_cy[ 0] = tmp00;
                sa_uint_t tmp01 = bucket2_yc[ 1 * 256]; bucket2_yc[ 1 * 256] = bucket2_cy[ 1]; bucket2_cy[ 1] = tmp01;
                sa_uint_t tmp02 = bucket2_yc[ 2 * 256]; bucket2_yc[ 2 * 256] = bucket2_cy[ 2]; bucket2_cy[ 2] = tmp02;
                sa_uint_t tmp03 = bucket2_yc[ 3 * 256]; bucket2_yc[ 3 * 256] = bucket2_cy[ 3]; bucket2_cy[ 3] = tmp03;
                sa_uint_t tmp04 = bucket2_yc[ 4 * 256]; bucket2_yc[ 4 * 256] = bucket2_cy[ 4]; bucket2_cy[ 4] = tmp04;
                sa_uint_t tmp05 = bucket2_yc[ 5 * 256]; bucket2_yc[ 5 * 256] = bucket2_cy[ 5]; bucket2_cy[ 5] = tmp05;
                sa_uint_t tmp06 = bucket2_yc[ 6 * 256]; bucket2_yc[ 6 * 256] = bucket2_cy[ 6]; bucket2_cy[ 6] = tmp06;
                sa_uint_t tmp07 = bucket2_yc[ 7 * 256]; bucket2_yc[ 7 * 256] = bucket2_cy[ 7]; bucket2_cy[ 7] = tmp07;
                sa_uint_t tmp08 = bucket2_yc[ 8 * 256]; bucket2_yc[ 8 * 256] = bucket2_cy[ 8]; bucket2_cy[ 8] = tmp08;
                sa_uint_t tmp09 = bucket2_yc[ 9 * 256]; bucket2_yc[ 9 * 256] = bucket2_cy[ 9]; bucket2_cy[ 9] = tmp09;
                sa_uint_t tmp10 = bucket2_yc[10 * 256]; bucket2_yc[10 * 256] = bucket2_cy[10]; bucket2_cy[10] = tmp10;
                sa_uint_t tmp11 = bucket2_yc[11 * 256]; bucket2_yc[11 * 256] = bucket2_cy[11]; bucket2_cy[11] = tmp11;
                sa_uint_t tmp12 = bucket2_yc[12 * 256]; bucket2_yc[12 * 256] = bucket2_cy[12]; bucket2_cy[12] = tmp12;
                sa_uint_t tmp13 = bucket2_yc[13 * 256]; bucket2_yc[13 * 256] = bucket2_cy[13]; bucket2_cy[13] = tmp13;
                sa_uint_t tmp14 = bucket2_yc[14 * 256]; bucket2_yc[14 * 256] = bucket2_cy[14]; bucket2_cy[14] = tmp14;
                sa_uint_t tmp15 = bucket2_yc[15 * 256]; bucket2_yc[15 * 256] = bucket2_cy[15]; bucket2_cy[15] = tmp15;
            }
        }
    }
}

/*
 * Single-threaded bigram histogram.
 *
 * Turns bucket1 from per-symbol counts into running start positions (the
 * running sum starts at 1). For every symbol c that occurs, the bytes of T
 * belonging to c's range [prev, sum) are histogrammed into row c of bucket2.
 * The range is split around `index`: the part below it reads T at the range
 * positions themselves, the part above it reads T shifted down by one.
 * Finally bucket2 is transposed.
 */
static void libsais_unbwt_compute_bigram_histogram_single(const uint8_t * RESTRICT T, sa_uint_t * RESTRICT bucket1, sa_uint_t * RESTRICT bucket2, fast_uint_t index)
{
    fast_uint_t sum, c;
    for (sum = 1, c = 0; c < ALPHABET_SIZE; ++c)
    {
        fast_uint_t prev = sum; sum += bucket1[c]; bucket1[c] = (sa_uint_t)prev;
        if (prev != sum)
        {
            sa_uint_t * RESTRICT bucket2_p = &bucket2[c << 8];

            {
                /* Part of the range below index: positions [prev, min(sum, index)). */
                fast_uint_t hi = index; if (sum < hi) { hi = sum; }
                libsais_unbwt_compute_histogram(&T[prev], (fast_sint_t)(hi - prev), bucket2_p);
            }

            {
                /* Part of the range above index: positions [max(prev, index + 1), sum), read from T one byte earlier. */
                fast_uint_t lo = index + 1; if (prev > lo) { lo = prev; }
                libsais_unbwt_compute_histogram(&T[lo - 1], (fast_sint_t)(sum - lo), bucket2_p);
            }
        }
    }

    libsais_unbwt_transpose_bucket2(bucket2);
}
/*
 * Converts the bigram counts in bucket2 into running start offsets and fills
 * the fastbits lookup table.
 *
 * Bigrams are visited in index order (w = major * ALPHABET_SIZE + minor). The
 * running offset starts at 1 and is bumped by one extra when `major` reaches
 * `lastc`. Every non-empty bigram w writes its own index into all fastbits
 * slots that are still unwritten up to and including ((end_of_w - 1) >> shift).
 */
static void libsais_unbwt_calculate_fastbits(sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t lastc, fast_uint_t shift)
{
    fast_uint_t slot = 0, bigram = 0, next = 1, major, minor;

    for (major = 0; major < ALPHABET_SIZE; ++major)
    {
        if (major == lastc) { next += 1; }

        for (minor = 0; minor < ALPHABET_SIZE; ++minor, ++bigram)
        {
            const fast_uint_t first = next;

            next += bucket2[bigram];
            bucket2[bigram] = (sa_uint_t)first;

            if (first != next)
            {
                const fast_uint_t last_slot = (next - 1) >> shift;

                while (slot <= last_slot) { fastbits[slot] = (uint16_t)bigram; ++slot; }
            }
        }
    }
}

/*
 * Fills P for the positions of one block.
 *
 * Two passes are made, split around `index`:
 *   - positions i in [omp_block_start, min(index, omp_block_end)) use c = T[i];
 *   - positions i in [max(index, omp_block_start) + 1, omp_block_end] use c = T[i - 1].
 * For each position the next slot p of symbol c is taken from bucket1 (which is
 * post-incremented). Unless p == index, the bigram w = (T[q] << 8) + c is formed
 * with q = p when p < index and q = p - 1 when p > index, and i is stored at
 * P[bucket2[w]++].
 *
 * The q selection right-shifts a possibly negative signed value by the sign-bit
 * position and therefore relies on that shift being arithmetic.
 */
static void libsais_unbwt_calculate_biPSI(const uint8_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket1, sa_uint_t * RESTRICT bucket2, fast_uint_t index, fast_sint_t omp_block_start, fast_sint_t omp_block_end)
{
    const fast_sint_t primary = (fast_sint_t)index;
    fast_sint_t pos, stop;

    /* Pass 1: positions below the primary index. */
    stop = omp_block_end < primary ? omp_block_end : primary;
    for (pos = omp_block_start; pos < stop; ++pos)
    {
        const fast_uint_t sym = T[pos];
        const fast_uint_t taken = bucket1[sym]++;
        const fast_sint_t delta = (fast_sint_t)(index - taken);

        if (delta != 0)
        {
            const fast_uint_t prior = T[taken + (fast_uint_t)(delta >> ((sizeof(fast_sint_t) * 8) - 1))];
            const fast_uint_t bigram = (prior << 8) + sym;

            P[bucket2[bigram]++] = (sa_uint_t)pos;
        }
    }

    /* Pass 2: positions above the primary index; the symbol is read one byte earlier. */
    pos = omp_block_start > primary ? omp_block_start : primary;
    for (pos += 1; pos <= omp_block_end; ++pos)
    {
        const fast_uint_t sym = T[pos - 1];
        const fast_uint_t taken = bucket1[sym]++;
        const fast_sint_t delta = (fast_sint_t)(index - taken);

        if (delta != 0)
        {
            const fast_uint_t prior = T[taken + (fast_uint_t)(delta >> ((sizeof(fast_sint_t) * 8) - 1))];
            const fast_uint_t bigram = (prior << 8) + sym;

            P[bucket2[bigram]++] = (sa_uint_t)pos;
        }
    }
}

/*
 * Single-threaded preparation of bucket2, fastbits and P for decoding.
 *
 * The symbol histogram is copied from `freq` when it is supplied and computed
 * from T otherwise. bucket2 is cleared, then the bigram histogram, the
 * fastbits table and P are built, in that order. I[0] is used as the index
 * and T[0] as `lastc`.
 */
static void libsais_unbwt_init_single(const uint8_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits)
{
    sa_uint_t bucket1[ALPHABET_SIZE];

    const fast_uint_t index = I[0];
    const fast_uint_t lastc = T[0];
    fast_uint_t shift;

    /* Smallest shift for which (n >> shift) fits in 1 << UNBWT_FASTBITS. */
    for (shift = 0; (n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS); shift++) { }

    if (freq == NULL)
    {
        memset(bucket1, 0, ALPHABET_SIZE * sizeof(sa_uint_t));
        libsais_unbwt_compute_histogram(T, n, bucket1);
    }
    else
    {
        memcpy(bucket1, freq, ALPHABET_SIZE * sizeof(sa_uint_t));
    }

    memset(bucket2, 0, ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));

    libsais_unbwt_compute_bigram_histogram_single(T, bucket1, bucket2, index);
    libsais_unbwt_calculate_fastbits(bucket2, fastbits, lastc, shift);
    libsais_unbwt_calculate_biPSI(T, P, bucket1, bucket2, index, 0, n);
}
#if defined(LIBSAIS_OPENMP)

/*
 * Counts bigrams for the positions [omp_block_start, omp_block_start + omp_block_size).
 *
 * For each position the symbol c = T[i] takes the next slot p from bucket1
 * (post-incremented). Unless p == index, the counter of bigram
 * (T[q] << 8) + c is incremented, with q = p when p < index and q = p - 1 when
 * p > index (selected through an arithmetic right shift of the signed
 * difference).
 */
static void libsais_unbwt_compute_bigram_histogram_parallel(const uint8_t * RESTRICT T, fast_uint_t index, sa_uint_t * RESTRICT bucket1, sa_uint_t * RESTRICT bucket2, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t stop = omp_block_start + omp_block_size;
    fast_sint_t pos;

    for (pos = omp_block_start; pos < stop; ++pos)
    {
        const fast_uint_t sym = T[pos];
        const fast_uint_t taken = bucket1[sym]++;
        const fast_sint_t delta = (fast_sint_t)(index - taken);

        if (delta != 0)
        {
            const fast_uint_t prior = T[taken + (fast_uint_t)(delta >> ((sizeof(fast_sint_t) * 8) - 1))];

            bucket2[(prior << 8) + sym]++;
        }
    }
}

/*
 * Multi-threaded counterpart of libsais_unbwt_init_single.
 *
 * `buckets` is used as one slot per thread, each slot holding a bucket1 array
 * (ALPHABET_SIZE entries) followed by a bucket2 array (ALPHABET_SIZE *
 * ALPHABET_SIZE entries). When the parallel region ends up with one thread the
 * work is delegated to libsais_unbwt_init_single; `freq` is only consulted on
 * that path. Otherwise the phases below run, separated by barriers:
 *
 *   1. every thread histograms its slice of T into its own bucket1;
 *   2. master: per-thread bucket1 counts become running totals over the
 *      preceding threads, the grand totals land in the shared bucket1, which is
 *      then turned into an exclusive prefix sum starting at 1;
 *   3. every thread adds the shared bucket1 to its own and counts the bigrams
 *      of its slice into its own bucket2;
 *   4. every thread takes a slice of the bigram index space and converts the
 *      per-thread bucket2 counts into running totals over the preceding
 *      threads, accumulating grand totals in the shared bucket2;
 *   5. master: builds fastbits from the shared bucket2 (which becomes start
 *      offsets), moves each thread's bucket1 into the next thread's slot and
 *      stores the shared bucket1 in slot 0;
 *   6. every thread adds the shared bucket2 to its own and fills P for its slice;
 *   7. master: copies the last thread's bucket2 into the shared bucket2.
 */
static void libsais_unbwt_init_parallel(const uint8_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_uint_t * RESTRICT buckets, sa_sint_t threads)
{
    sa_uint_t bucket1[ALPHABET_SIZE];

    fast_uint_t index = I[0];
    fast_uint_t lastc = T[0];
    fast_uint_t shift = 0;

    while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; }

    memset(bucket1, 0, ALPHABET_SIZE * sizeof(sa_uint_t));
    memset(bucket2, 0, ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));

    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
    {
        fast_sint_t omp_thread_num  = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();

        if (omp_num_threads == 1)
        {
            libsais_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits);
        }
        else
        {
            /* Number of sa_uint_t entries in one per-thread slot of `buckets`. */
            const fast_sint_t slot_size = ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE);

            sa_uint_t * RESTRICT bucket1_local = buckets + omp_thread_num * slot_size;
            sa_uint_t * RESTRICT bucket2_local = bucket1_local + ALPHABET_SIZE;

            fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
            fast_sint_t omp_block_start  = omp_thread_num * omp_block_stride;
            fast_sint_t omp_block_size   = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

            /* Phase 1 */
            memset(bucket1_local, 0, ALPHABET_SIZE * sizeof(sa_uint_t));
            libsais_unbwt_compute_histogram(T + omp_block_start, omp_block_size, bucket1_local);

            #pragma omp barrier

            /* Phase 2 */
            #pragma omp master
            {
                sa_uint_t * RESTRICT per_thread = buckets;
                fast_sint_t t, c;
                fast_uint_t next, symbol;

                for (t = 0; t < omp_num_threads; ++t, per_thread += slot_size)
                {
                    for (c = 0; c < ALPHABET_SIZE; c += 1)
                    {
                        sa_uint_t before = bucket1[c], mine = per_thread[c];

                        bucket1[c] = before + mine;
                        per_thread[c] = before;
                    }
                }

                for (next = 1, symbol = 0; symbol < ALPHABET_SIZE; ++symbol)
                {
                    fast_uint_t first = next;

                    next += bucket1[symbol];
                    bucket1[symbol] = (sa_uint_t)first;
                }
            }

            #pragma omp barrier

            /* Phase 3 */
            {
                fast_sint_t c;

                for (c = 0; c < ALPHABET_SIZE; c += 1)
                {
                    sa_uint_t base = bucket1[c], mine = bucket1_local[c];

                    bucket1_local[c] = base + mine;
                }

                memset(bucket2_local, 0, ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));
                libsais_unbwt_compute_bigram_histogram_parallel(T, index, bucket1_local, bucket2_local, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* Phase 4 */
            {
                fast_sint_t omp_bucket2_stride = ((ALPHABET_SIZE * ALPHABET_SIZE) / omp_num_threads) & (-16);
                fast_sint_t omp_bucket2_start  = omp_thread_num * omp_bucket2_stride;
                fast_sint_t omp_bucket2_size   = omp_thread_num < omp_num_threads - 1 ? omp_bucket2_stride : (ALPHABET_SIZE * ALPHABET_SIZE) - omp_bucket2_start;
                fast_sint_t omp_bucket2_end    = omp_bucket2_start + omp_bucket2_size;

                sa_uint_t * RESTRICT per_thread = buckets + ALPHABET_SIZE;
                fast_sint_t t, c;

                for (t = 0; t < omp_num_threads; ++t, per_thread += slot_size)
                {
                    for (c = omp_bucket2_start; c < omp_bucket2_end; c += 1)
                    {
                        sa_uint_t before = bucket2[c], mine = per_thread[c];

                        bucket2[c] = before + mine;
                        per_thread[c] = before;
                    }
                }
            }

            #pragma omp barrier

            /* Phase 5 */
            #pragma omp master
            {
                fast_sint_t t;

                libsais_unbwt_calculate_fastbits(bucket2, fastbits, lastc, shift);

                for (t = omp_num_threads - 1; t >= 1; --t)
                {
                    sa_uint_t * RESTRICT dst_bucket1 = buckets + t * slot_size;
                    sa_uint_t * RESTRICT src_bucket1 = dst_bucket1 - slot_size;

                    memcpy(dst_bucket1, src_bucket1, ALPHABET_SIZE * sizeof(sa_uint_t));
                }

                memcpy(buckets, bucket1, ALPHABET_SIZE * sizeof(sa_uint_t));
            }

            #pragma omp barrier

            /* Phase 6 */
            {
                fast_sint_t c;

                for (c = 0; c < ALPHABET_SIZE * ALPHABET_SIZE; c += 1)
                {
                    sa_uint_t base = bucket2[c], mine = bucket2_local[c];

                    bucket2_local[c] = base + mine;
                }

                libsais_unbwt_calculate_biPSI(T, P, bucket1_local, bucket2_local, index, omp_block_start, omp_block_start + omp_block_size);
            }

            #pragma omp barrier

            /* Phase 7 */
            #pragma omp master
            {
                memcpy(bucket2, buckets + ALPHABET_SIZE + (omp_num_threads - 1) * slot_size, ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));
            }
        }
    }
}

#endif
omp_bucket2_stride : (ALPHABET_SIZE * ALPHABET_SIZE) - omp_bucket2_start; sa_uint_t * RESTRICT bucket2_temp = buckets + ALPHABET_SIZE; fast_sint_t t; for (t = 0; t < omp_num_threads; ++t, bucket2_temp += ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) { fast_sint_t c; for (c = omp_bucket2_start; c < omp_bucket2_start + omp_bucket2_size; c += 1) { sa_uint_t A = bucket2[c], B = bucket2_temp[c]; bucket2[c] = A + B; bucket2_temp[c] = A; } } } #pragma omp barrier #pragma omp master { libsais_unbwt_calculate_fastbits(bucket2, fastbits, lastc, shift); { fast_sint_t t; for (t = omp_num_threads - 1; t >= 1; --t) { sa_uint_t * RESTRICT dst_bucket1 = buckets + t * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)); sa_uint_t * RESTRICT src_bucket1 = dst_bucket1 - (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)); memcpy(dst_bucket1, src_bucket1, ALPHABET_SIZE * sizeof(sa_uint_t)); } memcpy(buckets, bucket1, ALPHABET_SIZE * sizeof(sa_uint_t)); } } #pragma omp barrier { fast_sint_t c; for (c = 0; c < ALPHABET_SIZE * ALPHABET_SIZE; c += 1) { sa_uint_t A = bucket2[c], B = bucket2_local[c]; bucket2_local[c] = A + B; } libsais_unbwt_calculate_biPSI(T, P, bucket1_local, bucket2_local, index, omp_block_start, omp_block_start + omp_block_size); } #pragma omp barrier #pragma omp master { memcpy(bucket2, buckets + ALPHABET_SIZE + (omp_num_threads - 1) * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)), ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t)); } } } } #endif static void libsais_unbwt_decode_1(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t * i0, fast_uint_t k) { uint16_t * RESTRICT U0 = (uint16_t *)(void *)U; fast_uint_t i, p0 = *i0; for (i = 0; i != k; ++i) { uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0); } *i0 = p0; } static void libsais_unbwt_decode_2(uint8_t * RESTRICT U, sa_uint_t * 
RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t k) { uint16_t * RESTRICT U0 = (uint16_t *)(void *)U; uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r); fast_uint_t i, p0 = *i0, p1 = *i1; for (i = 0; i != k; ++i) { uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0); uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1); } *i0 = p0; *i1 = p1; } static void libsais_unbwt_decode_3(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t k) { uint16_t * RESTRICT U0 = (uint16_t *)(void *)U; uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r); uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r); fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2; for (i = 0; i != k; ++i) { uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0); uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1); uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2); } *i0 = p0; *i1 = p1; *i2 = p2; } static void libsais_unbwt_decode_4(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t k) { uint16_t * RESTRICT U0 = (uint16_t *)(void *)U; uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + 
r); uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r); uint16_t * RESTRICT U3 = (uint16_t *)(void *)(((uint8_t *)U2) + r); fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3; for (i = 0; i != k; ++i) { uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0); uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1); uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2); uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais_bswap16(c3); } *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; } static void libsais_unbwt_decode_5(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t k) { uint16_t * RESTRICT U0 = (uint16_t *)(void *)U; uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r); uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r); uint16_t * RESTRICT U3 = (uint16_t *)(void *)(((uint8_t *)U2) + r); uint16_t * RESTRICT U4 = (uint16_t *)(void *)(((uint8_t *)U3) + r); fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4; for (i = 0; i != k; ++i) { uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0); uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1); uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2); uint16_t c3 = 
fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais_bswap16(c3); uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = libsais_bswap16(c4); } *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; } static void libsais_unbwt_decode_6(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t k) { uint16_t * RESTRICT U0 = (uint16_t *)(void *)U; uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r); uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r); uint16_t * RESTRICT U3 = (uint16_t *)(void *)(((uint8_t *)U2) + r); uint16_t * RESTRICT U4 = (uint16_t *)(void *)(((uint8_t *)U3) + r); uint16_t * RESTRICT U5 = (uint16_t *)(void *)(((uint8_t *)U4) + r); fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5; for (i = 0; i != k; ++i) { uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0); uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1); uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2); uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais_bswap16(c3); uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = libsais_bswap16(c4); uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = libsais_bswap16(c5); } *i0 = 
p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; } static void libsais_unbwt_decode_7(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6, fast_uint_t k) { uint16_t * RESTRICT U0 = (uint16_t *)(void *)U; uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r); uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r); uint16_t * RESTRICT U3 = (uint16_t *)(void *)(((uint8_t *)U2) + r); uint16_t * RESTRICT U4 = (uint16_t *)(void *)(((uint8_t *)U3) + r); uint16_t * RESTRICT U5 = (uint16_t *)(void *)(((uint8_t *)U4) + r); uint16_t * RESTRICT U6 = (uint16_t *)(void *)(((uint8_t *)U5) + r); fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6; for (i = 0; i != k; ++i) { uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0); uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1); uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2); uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais_bswap16(c3); uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = libsais_bswap16(c4); uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = libsais_bswap16(c5); uint16_t c6 = fastbits[p6 >> shift]; if (bucket2[c6] <= p6) { do { c6++; } while (bucket2[c6] <= p6); } p6 = P[p6]; U6[i] = libsais_bswap16(c6); } *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; 
*i4 = p4; *i5 = p5; *i6 = p6; } static void libsais_unbwt_decode_8(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6, fast_uint_t * i7, fast_uint_t k) { uint16_t * RESTRICT U0 = (uint16_t *)(void *)U; uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r); uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r); uint16_t * RESTRICT U3 = (uint16_t *)(void *)(((uint8_t *)U2) + r); uint16_t * RESTRICT U4 = (uint16_t *)(void *)(((uint8_t *)U3) + r); uint16_t * RESTRICT U5 = (uint16_t *)(void *)(((uint8_t *)U4) + r); uint16_t * RESTRICT U6 = (uint16_t *)(void *)(((uint8_t *)U5) + r); uint16_t * RESTRICT U7 = (uint16_t *)(void *)(((uint8_t *)U6) + r); fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6, p7 = *i7; for (i = 0; i != k; ++i) { uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais_bswap16(c0); uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais_bswap16(c1); uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais_bswap16(c2); uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais_bswap16(c3); uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = libsais_bswap16(c4); uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = libsais_bswap16(c5); uint16_t c6 = fastbits[p6 >> shift]; if (bucket2[c6] <= p6) { do { c6++; } while (bucket2[c6] <= p6); } p6 = P[p6]; 
U6[i] = libsais_bswap16(c6); uint16_t c7 = fastbits[p7 >> shift]; if (bucket2[c7] <= p7) { do { c7++; } while (bucket2[c7] <= p7); } p7 = P[p7]; U7[i] = libsais_bswap16(c7); } *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; *i6 = p6; *i7 = p7; } static void libsais_unbwt_decode(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_sint_t blocks, fast_uint_t remainder) { fast_uint_t shift = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; } fast_uint_t offset = 0; while (blocks > 8) { fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7]; libsais_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, &i7, (fast_uint_t)r >> 1); I += 8; blocks -= 8; offset += 8 * (fast_uint_t)r; } if (blocks == 1) { fast_uint_t i0 = I[0]; libsais_unbwt_decode_1(U + offset, P, bucket2, fastbits, shift, &i0, remainder >> 1); } else if (blocks == 2) { fast_uint_t i0 = I[0], i1 = I[1]; libsais_unbwt_decode_2(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, remainder >> 1); libsais_unbwt_decode_1(U + offset + 2 * (remainder >> 1), P, bucket2, fastbits, shift, &i0, ((fast_uint_t)r >> 1) - (remainder >> 1)); } else if (blocks == 3) { fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2]; libsais_unbwt_decode_3(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, remainder >> 1); libsais_unbwt_decode_2(U + offset + 2 * (remainder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, ((fast_uint_t)r >> 1) - (remainder >> 1)); } else if (blocks == 4) { fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3]; libsais_unbwt_decode_4(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, remainder >> 1); libsais_unbwt_decode_3(U + offset + 2 * (remainder >> 1), P, bucket2, fastbits, shift, 
(fast_uint_t)r, &i0, &i1, &i2, ((fast_uint_t)r >> 1) - (remainder >> 1)); } else if (blocks == 5) { fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4]; libsais_unbwt_decode_5(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, remainder >> 1); libsais_unbwt_decode_4(U + offset + 2 * (remainder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, ((fast_uint_t)r >> 1) - (remainder >> 1)); } else if (blocks == 6) { fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5]; libsais_unbwt_decode_6(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, remainder >> 1); libsais_unbwt_decode_5(U + offset + 2 * (remainder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, ((fast_uint_t)r >> 1) - (remainder >> 1)); } else if (blocks == 7) { fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6]; libsais_unbwt_decode_7(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, remainder >> 1); libsais_unbwt_decode_6(U + offset + 2 * (remainder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, ((fast_uint_t)r >> 1) - (remainder >> 1)); } else { fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7]; libsais_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, &i7, remainder >> 1); libsais_unbwt_decode_7(U + offset + 2 * (remainder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, ((fast_uint_t)r >> 1) - (remainder >> 1)); } } static void libsais_unbwt_decode_omp(const uint8_t * RESTRICT T, uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_sint_t threads) { fast_uint_t lastc = T[0]; fast_sint_t 
blocks = 1 + (((fast_sint_t)n - 1) / (fast_sint_t)r); fast_uint_t remainder = (fast_uint_t)n - ((fast_uint_t)r * ((fast_uint_t)blocks - 1)); #if defined(LIBSAIS_OPENMP) fast_sint_t max_threads = blocks < threads ? blocks : threads; #pragma omp parallel num_threads(max_threads) if(max_threads > 1 && n >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = blocks / omp_num_threads; fast_sint_t omp_block_remainder = blocks % omp_num_threads; fast_sint_t omp_block_size = omp_block_stride + (omp_thread_num < omp_block_remainder); fast_sint_t omp_block_start = omp_block_stride * omp_thread_num + (omp_thread_num < omp_block_remainder ? omp_thread_num : omp_block_remainder); libsais_unbwt_decode(U + r * omp_block_start, P, n, r, I + omp_block_start, bucket2, fastbits, omp_block_size, omp_thread_num < omp_num_threads - 1 ? 
(fast_uint_t)r : remainder); } U[n - 1] = (uint8_t)lastc; } static sa_sint_t libsais_unbwt_core(const uint8_t * RESTRICT T, uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_uint_t * RESTRICT buckets, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) if (threads > 1 && n >= 262144) { libsais_unbwt_init_parallel(T, P, n, freq, I, bucket2, fastbits, buckets, threads); } else #else UNUSED(buckets); #endif { libsais_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits); } libsais_unbwt_decode_omp(T, U, P, n, r, I, bucket2, fastbits, threads); return 0; } static sa_sint_t libsais_unbwt_main(const uint8_t * T, uint8_t * U, sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I, sa_sint_t threads) { fast_uint_t shift = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; } sa_uint_t * RESTRICT bucket2 = (sa_uint_t *)libsais_alloc_aligned(ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096); uint16_t * RESTRICT fastbits = (uint16_t *)libsais_alloc_aligned(((size_t)1 + (size_t)(n >> shift)) * sizeof(uint16_t), 4096); sa_uint_t * RESTRICT buckets = threads > 1 && n >= 262144 ? (sa_uint_t *)libsais_alloc_aligned((size_t)threads * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) * sizeof(sa_uint_t), 4096) : NULL; sa_sint_t index = bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1 || n < 262144) ? 
libsais_unbwt_core(T, U, P, n, freq, r, I, bucket2, fastbits, buckets, threads) : -2; libsais_free_aligned(buckets); libsais_free_aligned(fastbits); libsais_free_aligned(bucket2); return index; } static sa_sint_t libsais_unbwt_main_ctx(const LIBSAIS_UNBWT_CONTEXT * ctx, const uint8_t * T, uint8_t * U, sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I) { return ctx != NULL && ctx->bucket2 != NULL && ctx->fastbits != NULL && (ctx->buckets != NULL || ctx->threads == 1) ? libsais_unbwt_core(T, U, P, n, freq, r, I, ctx->bucket2, ctx->fastbits, ctx->buckets, (sa_sint_t)ctx->threads) : -2; } void * libsais_unbwt_create_ctx(void) { return (void *)libsais_unbwt_create_ctx_main(1); } void libsais_unbwt_free_ctx(void * ctx) { libsais_unbwt_free_ctx_main((LIBSAIS_UNBWT_CONTEXT *)ctx); } int32_t libsais_unbwt(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i) { return libsais_unbwt_aux(T, U, A, n, freq, n, &i); } int32_t libsais_unbwt_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i) { return libsais_unbwt_aux_ctx(ctx, T, U, A, n, freq, n, &i); } int32_t libsais_unbwt_aux(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL)) { return -1; } else if (n <= 1) { if (I[0] != n) { return -1; } if (n == 1) { U[0] = T[0]; } return 0; } fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } } return libsais_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, 1); } int32_t libsais_unbwt_aux_ctx(const void * ctx, const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) 
|| (I == NULL)) { return -1; } else if (n <= 1) { if (I[0] != n) { return -1; } if (n == 1) { U[0] = T[0]; } return 0; } fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } } return libsais_unbwt_main_ctx((const LIBSAIS_UNBWT_CONTEXT *)ctx, T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I); } #if defined(LIBSAIS_OPENMP) void * libsais_unbwt_create_ctx_omp(int32_t threads) { if (threads < 0) { return NULL; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? threads : 1; return (void *)libsais_unbwt_create_ctx_main(threads); } int32_t libsais_unbwt_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i, int32_t threads) { return libsais_unbwt_aux_omp(T, U, A, n, freq, n, &i, threads); } int32_t libsais_unbwt_aux_omp(const uint8_t * T, uint8_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I, int32_t threads) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL) || (threads < 0)) { return -1; } else if (n <= 1) { if (I[0] != n) { return -1; } if (n == 1) { U[0] = T[0]; } return 0; } fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? threads : 1; return libsais_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, threads); } #endif static void libsais_compute_phi(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP, sa_sint_t n, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; sa_sint_t k = omp_block_start > 0 ? 
SA[omp_block_start - 1] : n; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) { libsais_prefetchr(&SA[i + 2 * prefetch_distance]); libsais_prefetchw(&PLCP[SA[i + prefetch_distance + 0]]); libsais_prefetchw(&PLCP[SA[i + prefetch_distance + 1]]); PLCP[SA[i + 0]] = k; k = SA[i + 0]; PLCP[SA[i + 1]] = k; k = SA[i + 1]; libsais_prefetchw(&PLCP[SA[i + prefetch_distance + 2]]); libsais_prefetchw(&PLCP[SA[i + prefetch_distance + 3]]); PLCP[SA[i + 2]] = k; k = SA[i + 2]; PLCP[SA[i + 3]] = k; k = SA[i + 3]; } for (j += prefetch_distance + 3; i < j; i += 1) { PLCP[SA[i]] = k; k = SA[i]; } } static void libsais_compute_phi_omp(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP, sa_sint_t n, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; libsais_compute_phi(SA, PLCP, n, omp_block_start, omp_block_size); } } static void libsais_compute_plcp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, fast_sint_t n, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j, l = 0; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance; i < j; i += 1) { libsais_prefetchw(&PLCP[i + 2 * prefetch_distance]); libsais_prefetchr(&T[PLCP[i + prefetch_distance] + l]); fast_sint_t k = PLCP[i], m = n - (i > k ? 
i : k); while (l < m && T[i + l] == T[k + l]) { l++; } PLCP[i] = (sa_sint_t)l; l -= (l != 0); } for (j += prefetch_distance; i < j; i += 1) { fast_sint_t k = PLCP[i], m = n - (i > k ? i : k); while (l < m && T[i + l] == T[k + l]) { l++; } PLCP[i] = (sa_sint_t)l; l -= (l != 0); } } static void libsais_compute_plcp_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, sa_sint_t n, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; libsais_compute_plcp(T, PLCP, n, omp_block_start, omp_block_size); } } static void libsais_compute_plcp_gsa(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j, l = 0; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance; i < j; i += 1) { libsais_prefetchw(&PLCP[i + 2 * prefetch_distance]); libsais_prefetchr(&T[PLCP[i + prefetch_distance] + l]); fast_sint_t k = PLCP[i]; while (T[i + l] > 0 && T[i + l] == T[k + l]) { l++; } PLCP[i] = (sa_sint_t)l; l -= (l != 0); } for (j += prefetch_distance; i < j; i += 1) { fast_sint_t k = PLCP[i]; while (T[i + l] > 0 && T[i + l] == T[k + l]) { l++; } PLCP[i] = (sa_sint_t)l; l -= (l != 0); } } static void libsais_compute_plcp_gsa_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, sa_sint_t n, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) 
#endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; libsais_compute_plcp_gsa(T, PLCP, omp_block_start, omp_block_size); } } static void libsais_compute_plcp_int(const int32_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, fast_sint_t n, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j, l = 0; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance; i < j; i += 1) { libsais_prefetchw(&PLCP[i + 2 * prefetch_distance]); libsais_prefetchr(&T[PLCP[i + prefetch_distance] + l]); fast_sint_t k = PLCP[i], m = n - (i > k ? i : k); while (l < m && T[i + l] == T[k + l]) { l++; } PLCP[i] = (sa_sint_t)l; l -= (l != 0); } for (j += prefetch_distance; i < j; i += 1) { fast_sint_t k = PLCP[i], m = n - (i > k ? i : k); while (l < m && T[i + l] == T[k + l]) { l++; } PLCP[i] = (sa_sint_t)l; l -= (l != 0); } } static void libsais_compute_plcp_int_omp(const int32_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, sa_sint_t n, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : n - omp_block_start; libsais_compute_plcp_int(T, PLCP, n, omp_block_start, omp_block_size); } } static void libsais_compute_lcp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT LCP, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) { libsais_prefetchr(&SA[i + 2 * prefetch_distance]); libsais_prefetchw(&LCP[i + prefetch_distance]); libsais_prefetchr(&PLCP[SA[i + prefetch_distance + 0]]); libsais_prefetchr(&PLCP[SA[i + prefetch_distance + 1]]); LCP[i + 0] = PLCP[SA[i + 0]]; LCP[i + 1] = PLCP[SA[i + 1]]; libsais_prefetchr(&PLCP[SA[i + prefetch_distance + 2]]); libsais_prefetchr(&PLCP[SA[i + prefetch_distance + 3]]); LCP[i + 2] = PLCP[SA[i + 2]]; LCP[i + 3] = PLCP[SA[i + 3]]; } for (j += prefetch_distance + 3; i < j; i += 1) { LCP[i] = PLCP[SA[i]]; } } static void libsais_compute_lcp_omp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT LCP, sa_sint_t n, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
/*
 * Single-threaded PLCP construction for a byte string.
 * Returns -1 if any pointer is NULL or n is negative, 0 otherwise.
 * For n == 1 the only entry is set to 0; for n == 0 nothing is written.
 */
int32_t libsais_plcp(const uint8_t * T, const int32_t * SA, int32_t * PLCP, int32_t n)
{
    if (T == NULL || SA == NULL || PLCP == NULL || n < 0) { return -1; }

    if (n < 2)
    {
        if (n != 0) { PLCP[0] = 0; }
        return 0;
    }

    libsais_compute_phi_omp(SA, PLCP, n, 1);
    libsais_compute_plcp_omp(T, PLCP, n, 1);

    return 0;
}

/*
 * Single-threaded PLCP construction using the zero-terminated (GSA) comparison.
 * In addition to the checks done by libsais_plcp, a non-empty input whose last
 * symbol is not 0 is rejected with -1.
 */
int32_t libsais_plcp_gsa(const uint8_t * T, const int32_t * SA, int32_t * PLCP, int32_t n)
{
    if (T == NULL || SA == NULL || PLCP == NULL || n < 0) { return -1; }
    if (n > 0 && T[n - 1] != 0)                           { return -1; }

    if (n < 2)
    {
        if (n != 0) { PLCP[0] = 0; }
        return 0;
    }

    libsais_compute_phi_omp(SA, PLCP, n, 1);
    libsais_compute_plcp_gsa_omp(T, PLCP, n, 1);

    return 0;
}

/*
 * Single-threaded PLCP construction for a 32-bit integer string.
 * Returns -1 if any pointer is NULL or n is negative, 0 otherwise.
 */
int32_t libsais_plcp_int(const int32_t * T, const int32_t * SA, int32_t * PLCP, int32_t n)
{
    if (T == NULL || SA == NULL || PLCP == NULL || n < 0) { return -1; }

    if (n < 2)
    {
        if (n != 0) { PLCP[0] = 0; }
        return 0;
    }

    libsais_compute_phi_omp(SA, PLCP, n, 1);
    libsais_compute_plcp_int_omp(T, PLCP, n, 1);

    return 0;
}

/*
 * Single-threaded LCP construction: LCP[i] = PLCP[SA[i]] for all i in [0, n).
 * Returns -1 if any pointer is NULL or n is negative, 0 otherwise.
 */
int32_t libsais_lcp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP, int32_t n)
{
    if (PLCP == NULL || SA == NULL || LCP == NULL || n < 0) { return -1; }

    if (n < 2)
    {
        if (n != 0) { LCP[0] = PLCP[SA[0]]; }
        return 0;
    }

    libsais_compute_lcp_omp(PLCP, SA, LCP, n, 1);

    return 0;
}
/*
 * Multi-threaded PLCP construction using the zero-terminated (GSA) comparison.
 *
 * Returns -1 if any pointer is NULL, n is negative, a non-empty input does not
 * end with a 0 symbol, or threads is negative; 0 otherwise.
 * threads == 0 selects omp_get_max_threads() / omp_get_num_threads(), with a
 * floor of one thread.
 */
int32_t libsais_plcp_gsa_omp(const uint8_t * T, const int32_t * SA, int32_t * PLCP, int32_t n, int32_t threads)
{
    if (T == NULL || SA == NULL || PLCP == NULL || n < 0) { return -1; }
    if (n > 0 && T[n - 1] != 0)                           { return -1; }
    if (threads < 0)                                      { return -1; }

    if (n < 2)
    {
        if (n != 0) { PLCP[0] = 0; }
        return 0;
    }

    if (threads == 0) { threads = omp_get_max_threads() / omp_get_num_threads(); }
    if (threads <= 0) { threads = 1; }

    libsais_compute_phi_omp(SA, PLCP, n, threads);
    libsais_compute_plcp_gsa_omp(T, PLCP, n, threads);

    return 0;
}

/*
 * Multi-threaded PLCP construction for a 32-bit integer string.
 *
 * Returns -1 if any pointer is NULL, n is negative or threads is negative;
 * 0 otherwise. threads == 0 selects
 * omp_get_max_threads() / omp_get_num_threads(), with a floor of one thread.
 */
int32_t libsais_plcp_int_omp(const int32_t * T, const int32_t * SA, int32_t * PLCP, int32_t n, int32_t threads)
{
    if (T == NULL || SA == NULL || PLCP == NULL || n < 0 || threads < 0) { return -1; }

    if (n < 2)
    {
        if (n != 0) { PLCP[0] = 0; }
        return 0;
    }

    if (threads == 0) { threads = omp_get_max_threads() / omp_get_num_threads(); }
    if (threads <= 0) { threads = 1; }

    libsais_compute_phi_omp(SA, PLCP, n, threads);
    libsais_compute_plcp_int_omp(T, PLCP, n, threads);

    return 0;
}

/*
 * Multi-threaded LCP construction: LCP[i] = PLCP[SA[i]] for all i in [0, n).
 *
 * Returns -1 if any pointer is NULL, n is negative or threads is negative;
 * 0 otherwise. threads == 0 selects
 * omp_get_max_threads() / omp_get_num_threads(), with a floor of one thread.
 */
int32_t libsais_lcp_omp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP, int32_t n, int32_t threads)
{
    if (PLCP == NULL || SA == NULL || LCP == NULL || n < 0 || threads < 0) { return -1; }

    if (n < 2)
    {
        if (n != 0) { LCP[0] = PLCP[SA[0]]; }
        return 0;
    }

    if (threads == 0) { threads = omp_get_max_threads() / omp_get_num_threads(); }
    if (threads <= 0) { threads = 1; }

    libsais_compute_lcp_omp(PLCP, SA, LCP, n, threads);

    return 0;
}
/*
 * One staged (symbol, index) pair. libsais16_place_cached_suffixes consumes
 * these as SA[symbol] = index, so at that point `symbol` is an SA position.
 */
typedef struct LIBSAIS_THREAD_CACHE
{
        sa_sint_t                       symbol;
        sa_sint_t                       index;
} LIBSAIS_THREAD_CACHE;

/*
 * Per-thread working state. The union with padding[64] makes every element at
 * least 64 bytes large (presumably to keep threads on separate cache lines --
 * TODO confirm).
 */
typedef union LIBSAIS_THREAD_STATE
{
    struct
    {
        /* usage of position/count is not visible in this part of the file */
        fast_sint_t                     position;
        fast_sint_t                     count;

        /* m is summed across threads and last_lms_suffix is stored into SA
           by libsais16_gather_lms_suffixes_16u_omp */
        fast_sint_t                     m;
        fast_sint_t                     last_lms_suffix;

        /* slices of two shared allocations handed out by libsais16_alloc_thread_state:
           4 * ALPHABET_SIZE counters and LIBSAIS_PER_THREAD_CACHE_SIZE cache entries per thread */
        sa_sint_t *                     buckets;
        LIBSAIS_THREAD_CACHE *          cache;
    } state;

    uint8_t padding[64];
} LIBSAIS_THREAD_STATE;

/*
 * Reusable construction context created by libsais16_create_ctx_main:
 * a bucket array of 8 * ALPHABET_SIZE entries, the per-thread state array
 * (NULL when threads == 1) and the thread count.
 */
typedef struct LIBSAIS_CONTEXT
{
    sa_sint_t *                         buckets;
    LIBSAIS_THREAD_STATE *              thread_state;
    fast_sint_t                         threads;
} LIBSAIS_CONTEXT;
/* Map RESTRICT to the compiler-specific spelling of the restrict qualifier;
   compilers outside the listed families are rejected at build time. */
#if defined(__GNUC__) || defined(__clang__)
    #define RESTRICT __restrict__
#elif defined(_MSC_VER) || defined(__INTEL_COMPILER)
    #define RESTRICT __restrict
#else
    #error Your compiler, configuration or platform is not supported.
#endif

/* Define HAS_BUILTIN_PREFETCH when __builtin_prefetch is available: probed via
   __has_builtin where that exists, otherwise assumed for GCC 3.2 and newer. */
#if defined(__has_builtin)
    #if __has_builtin(__builtin_prefetch)
        #define HAS_BUILTIN_PREFETCH
    #endif
#elif defined(__GNUC__) && (((__GNUC__ == 3) && (__GNUC_MINOR__ >= 2)) || (__GNUC__ >= 4))
    #define HAS_BUILTIN_PREFETCH
#endif
/*
 * Rounds `address` up to the nearest multiple of `alignment`.
 * `alignment` must be a power of two: the rounding uses the mask -alignment.
 */
static void * libsais16_align_up(const void * address, size_t alignment)
{
    return (void *)((((ptrdiff_t)address) + ((ptrdiff_t)alignment) - 1) & (-((ptrdiff_t)alignment)));
}

/*
 * Allocates `size` bytes aligned to `alignment` (a non-zero power of two).
 *
 * The block is over-allocated with malloc; the distance from the malloc'd
 * pointer to the returned pointer is stored in the `short` located directly
 * in front of the returned pointer so libsais16_free_aligned can undo it.
 *
 * Returns NULL when malloc fails, when `alignment` is zero, not a power of two
 * or too large for the `short` header, or when the padded request size would
 * overflow size_t. Release the result with libsais16_free_aligned.
 */
static void * libsais16_alloc_aligned(size_t size, size_t alignment)
{
    const size_t header = sizeof(short);

    /* the mask trick in libsais16_align_up only works for powers of two */
    if (alignment == 0 || (alignment & (alignment - 1)) != 0) { return NULL; }

    /* worst-case offset is header + alignment - 1 and must fit the short header */
    if (alignment > (size_t)SHRT_MAX - header + 1) { return NULL; }

    /* size + header + alignment - 1 must not wrap around, otherwise malloc
       would hand back a block far smaller than the caller asked for */
    if (size > SIZE_MAX - header - (alignment - 1)) { return NULL; }

    void * address = malloc(size + header + alignment - 1);
    if (address == NULL) { return NULL; }

    void * aligned_address = libsais16_align_up((void *)((ptrdiff_t)address + (ptrdiff_t)header), alignment);
    ((short *)aligned_address)[-1] = (short)((ptrdiff_t)aligned_address - (ptrdiff_t)address);

    return aligned_address;
}

/*
 * Releases a block obtained from libsais16_alloc_aligned. NULL is ignored.
 * The original malloc pointer is recovered from the offset stored just in
 * front of `aligned_address`.
 */
static void libsais16_free_aligned(void * aligned_address)
{
    if (aligned_address != NULL)
    {
        free((void *)((ptrdiff_t)aligned_address - ((short *)aligned_address)[-1]));
    }
}
*)libsais16_alloc_aligned((size_t)threads * 4 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); LIBSAIS_THREAD_CACHE * RESTRICT thread_cache = (LIBSAIS_THREAD_CACHE *)libsais16_alloc_aligned((size_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE * sizeof(LIBSAIS_THREAD_CACHE), 4096); if (thread_state != NULL && thread_buckets != NULL && thread_cache != NULL) { fast_sint_t t; for (t = 0; t < threads; ++t) { thread_state[t].state.buckets = thread_buckets; thread_buckets += 4 * ALPHABET_SIZE; thread_state[t].state.cache = thread_cache; thread_cache += LIBSAIS_PER_THREAD_CACHE_SIZE; } return thread_state; } libsais16_free_aligned(thread_cache); libsais16_free_aligned(thread_buckets); libsais16_free_aligned(thread_state); return NULL; } static void libsais16_free_thread_state(LIBSAIS_THREAD_STATE * thread_state) { if (thread_state != NULL) { libsais16_free_aligned(thread_state[0].state.cache); libsais16_free_aligned(thread_state[0].state.buckets); libsais16_free_aligned(thread_state); } } static LIBSAIS_CONTEXT * libsais16_create_ctx_main(sa_sint_t threads) { LIBSAIS_CONTEXT * RESTRICT ctx = (LIBSAIS_CONTEXT *)libsais16_alloc_aligned(sizeof(LIBSAIS_CONTEXT), 64); sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais16_alloc_aligned((size_t)8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? 
libsais16_alloc_thread_state(threads) : NULL; if (ctx != NULL && buckets != NULL && (thread_state != NULL || threads == 1)) { ctx->buckets = buckets; ctx->threads = threads; ctx->thread_state = thread_state; return ctx; } libsais16_free_thread_state(thread_state); libsais16_free_aligned(buckets); libsais16_free_aligned(ctx); return NULL; } static void libsais16_free_ctx_main(LIBSAIS_CONTEXT * ctx) { if (ctx != NULL) { libsais16_free_thread_state(ctx->thread_state); libsais16_free_aligned(ctx->buckets); libsais16_free_aligned(ctx); } } #if defined(LIBSAIS_OPENMP) static sa_sint_t libsais16_count_negative_marked_suffixes(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { sa_sint_t count = 0; fast_sint_t i; for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) { count += (SA[i] < 0); } return count; } static sa_sint_t libsais16_count_zero_marked_suffixes(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { sa_sint_t count = 0; fast_sint_t i; for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) { count += (SA[i] == 0); } return count; } static void libsais16_place_cached_suffixes(sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) { libsais16_prefetchr(&cache[i + 2 * prefetch_distance]); libsais16_prefetchw(&SA[cache[i + prefetch_distance + 0].symbol]); libsais16_prefetchw(&SA[cache[i + prefetch_distance + 1].symbol]); libsais16_prefetchw(&SA[cache[i + prefetch_distance + 2].symbol]); libsais16_prefetchw(&SA[cache[i + prefetch_distance + 3].symbol]); SA[cache[i + 0].symbol] = cache[i + 0].index; SA[cache[i + 1].symbol] = cache[i + 1].index; SA[cache[i + 2].symbol] = cache[i + 2].index; SA[cache[i + 3].symbol] = cache[i + 3].index; } for (j += 
prefetch_distance + 3; i < j; i += 1) { SA[cache[i].symbol] = cache[i].index; } } static void libsais16_compact_and_place_cached_suffixes(sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j, l; for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4) { libsais16_prefetchw(&cache[i + prefetch_distance]); cache[l] = cache[i + 0]; l += cache[l].symbol >= 0; cache[l] = cache[i + 1]; l += cache[l].symbol >= 0; cache[l] = cache[i + 2]; l += cache[l].symbol >= 0; cache[l] = cache[i + 3]; l += cache[l].symbol >= 0; } for (j += 3; i < j; i += 1) { cache[l] = cache[i]; l += cache[l].symbol >= 0; } libsais16_place_cached_suffixes(SA, cache, omp_block_start, l - omp_block_start); } static void libsais16_accumulate_counts_s32_2(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) { sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s]; } } static void libsais16_accumulate_counts_s32_3(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) { sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s]; } } static void libsais16_accumulate_counts_s32_4(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) { sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s]; } } static void libsais16_accumulate_counts_s32_5(sa_sint_t * 
/*
 * Adds, element by element, the five bucket arrays located 1..5 strides
 * before `bucket00` into `bucket00` (first `bucket_size` elements of each).
 */
static void libsais16_accumulate_counts_s32_6(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
{
    const sa_sint_t * RESTRICT prev1 = bucket00 - bucket_stride;
    const sa_sint_t * RESTRICT prev2 = prev1 - bucket_stride;
    const sa_sint_t * RESTRICT prev3 = prev2 - bucket_stride;
    const sa_sint_t * RESTRICT prev4 = prev3 - bucket_stride;
    const sa_sint_t * RESTRICT prev5 = prev4 - bucket_stride;

    fast_sint_t slot = 0;
    while (slot < bucket_size)
    {
        sa_sint_t total = bucket00[slot];

        total = total + prev1[slot];
        total = total + prev2[slot];
        total = total + prev3[slot];
        total = total + prev4[slot];
        total = total + prev5[slot];

        bucket00[slot] = total;
        slot += 1;
    }
}

/*
 * Adds, element by element, the six bucket arrays located 1..6 strides
 * before `bucket00` into `bucket00` (first `bucket_size` elements of each).
 */
static void libsais16_accumulate_counts_s32_7(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
{
    const sa_sint_t * RESTRICT prev1 = bucket00 - bucket_stride;
    const sa_sint_t * RESTRICT prev2 = prev1 - bucket_stride;
    const sa_sint_t * RESTRICT prev3 = prev2 - bucket_stride;
    const sa_sint_t * RESTRICT prev4 = prev3 - bucket_stride;
    const sa_sint_t * RESTRICT prev5 = prev4 - bucket_stride;
    const sa_sint_t * RESTRICT prev6 = prev5 - bucket_stride;

    fast_sint_t slot = 0;
    while (slot < bucket_size)
    {
        sa_sint_t total = bucket00[slot];

        total = total + prev1[slot];
        total = total + prev2[slot];
        total = total + prev3[slot];
        total = total + prev4[slot];
        total = total + prev5[slot];
        total = total + prev6[slot];

        bucket00[slot] = total;
        slot += 1;
    }
}
bucket04 = bucket03 - bucket_stride; sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride; sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride; sa_sint_t * RESTRICT bucket07 = bucket06 - bucket_stride; fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s]; } } static void libsais16_accumulate_counts_s32_9(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) { sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride; sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride; sa_sint_t * RESTRICT bucket07 = bucket06 - bucket_stride; sa_sint_t * RESTRICT bucket08 = bucket07 - bucket_stride; fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s] + bucket08[s]; } } static void libsais16_accumulate_counts_s32(sa_sint_t * RESTRICT buckets, fast_sint_t bucket_size, fast_sint_t bucket_stride, fast_sint_t num_buckets) { while (num_buckets >= 9) { libsais16_accumulate_counts_s32_9(buckets - (num_buckets - 9) * bucket_stride, bucket_size, bucket_stride); num_buckets -= 8; } switch (num_buckets) { case 1: break; case 2: libsais16_accumulate_counts_s32_2(buckets, bucket_size, bucket_stride); break; case 3: libsais16_accumulate_counts_s32_3(buckets, bucket_size, bucket_stride); break; case 4: libsais16_accumulate_counts_s32_4(buckets, bucket_size, bucket_stride); break; case 5: libsais16_accumulate_counts_s32_5(buckets, bucket_size, bucket_stride); break; case 6: libsais16_accumulate_counts_s32_6(buckets, bucket_size, bucket_stride); break; case 7: 
libsais16_accumulate_counts_s32_7(buckets, bucket_size, bucket_stride); break; case 8: libsais16_accumulate_counts_s32_8(buckets, bucket_size, bucket_stride); break; default: break; } } #endif static void libsais16_flip_suffix_markers_omp(sa_sint_t * RESTRICT SA, sa_sint_t l, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && l >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (l / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : l - omp_block_start; fast_sint_t i; for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) { SA[i] ^= SAINT_MIN; } } } static void libsais16_gather_lms_suffixes_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, fast_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { if (omp_block_size > 0) { const fast_sint_t prefetch_distance = 256; fast_sint_t i, j = omp_block_start + omp_block_size, c0 = T[omp_block_start + omp_block_size - 1], c1 = -1; while (j < n && (c1 = T[j]) == c0) { ++j; } fast_uint_t f0 = c0 >= c1, f1 = 0; for (i = omp_block_start + omp_block_size - 2, j = omp_block_start + 3; i >= j; i -= 4) { libsais16_prefetchr(&T[i - prefetch_distance]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0); c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 0); m -= (f0 & ~f1); c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i - 1); m -= (f1 & ~f0); c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 2); m -= (f0 & 
~f1); } for (j -= 3; i >= j; i -= 1) { c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i + 1); m -= (f0 & ~f1); } SA[m] = (sa_sint_t)(i + 1); } } static void libsais16_gather_lms_suffixes_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && omp_get_dynamic() == 0) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; if (omp_num_threads == 1) { libsais16_gather_lms_suffixes_16u(T, SA, n, (fast_sint_t)n - 1, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t t, m = 0; for (t = omp_num_threads - 1; t > omp_thread_num; --t) { m += thread_state[t].state.m; } libsais16_gather_lms_suffixes_16u(T, SA, n, (fast_sint_t)n - 1 - m, omp_block_start, omp_block_size); #pragma omp barrier if (thread_state[omp_thread_num].state.m > 0) { SA[(fast_sint_t)n - 1 - m] = (sa_sint_t)thread_state[omp_thread_num].state.last_lms_suffix; } } #endif } } static sa_sint_t libsais16_gather_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n) { const fast_sint_t prefetch_distance = 64; sa_sint_t i = n - 2; sa_sint_t m = n - 1; fast_uint_t f0 = 1; fast_uint_t f1 = 0; fast_sint_t c0 = T[n - 1]; fast_sint_t c1 = 0; for (; i >= 3; i -= 4) { libsais16_prefetchr(&T[i - prefetch_distance]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = i + 1; m 
/*
 * Scans T from right to left and collects selected positions into the tail of
 * SA, filling downwards from SA[n - 1]. Returns the number of positions kept.
 *
 * c0/c1 alternate as "current symbol" and "symbol to its right"; f0/f1 are the
 * matching one-bit flags. A position's flag is 1 when its symbol is greater
 * than the symbol to its right, or equal to it while the right-hand flag is 1;
 * the last position starts with flag 1. Position p is kept when the flag of
 * p - 1 is 1, the flag of p is 0 and T[p] is non-negative (per the function
 * name these are the LMS suffixes of the compacted string -- negative symbols
 * are the ones skipped).
 *
 * SA[m] is written for every candidate; m only moves down when the candidate
 * is kept, so a rejected candidate is overwritten by the next one.
 */
static sa_sint_t libsais16_gather_compacted_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t             i   = n - 2;
    sa_sint_t             m   = n - 1;
    fast_uint_t           f0  = 1;
    fast_uint_t           f1  = 0;
    fast_sint_t           c0  = T[n - 1];
    fast_sint_t           c1  = 0;

    /* four positions per iteration; the roles of (c0, f0) and (c1, f1) swap on every step */
    for (; i >= 3; i -= 4)
    {
        libsais16_prefetchr(&T[i - prefetch_distance]);

        c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = i + 1; m -= (sa_sint_t)(f1 & ~f0 & (c0 >= 0));
        c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = i - 0; m -= (sa_sint_t)(f0 & ~f1 & (c1 >= 0));
        c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = i - 1; m -= (sa_sint_t)(f1 & ~f0 & (c0 >= 0));
        c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = i - 2; m -= (sa_sint_t)(f0 & ~f1 & (c1 >= 0));
    }

    /* remaining positions, one at a time: shift current into "right" and read the next symbol */
    for (; i >= 0; i -= 1)
    {
        c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = i + 1; m -= (sa_sint_t)(f0 & ~f1 & (c1 >= 0));
    }

    return n - 1 - m;
}
/*
 * Builds a histogram of T[0 .. n - 1] with two counters per symbol
 * (BUCKETS_INDEX2(c, s) = 2 * c + s); the first 2 * k counters are cleared first.
 *
 * T is scanned from right to left. c0/c1 alternate as "current symbol" and
 * "symbol to its right"; f0/f1 are the matching one-bit flags. A position's
 * flag is 1 when its symbol is greater than the symbol to its right, or equal
 * to it while the right-hand flag is 1; the last position starts with flag 1.
 *
 * A position whose own flag is 0 while the flag of the position to its left is
 * 1 is counted in slot 1 of its symbol (per the function name these are the
 * LMS suffixes); every other position is counted in slot 0. Position 0 has no
 * left neighbour and is always counted in slot 0 by the final statement.
 */
static void libsais16_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets)
{
    const fast_sint_t prefetch_distance = 64;

    memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));

    sa_sint_t             i   = n - 2;
    fast_uint_t           f0  = 1;
    fast_uint_t           f1  = 0;
    fast_sint_t           c0  = T[n - 1];
    fast_sint_t           c1  = 0;

    /* four positions per iteration while it is still safe to read T prefetch_distance entries ahead */
    for (; i >= prefetch_distance + 3; i -= 4)
    {
        libsais16_prefetchr(&T[i - 2 * prefetch_distance]);

        libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]);
        libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]);
        libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]);
        libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]);

        /* each step reads the symbol to the left, computes its flag, then counts the symbol on the right */
        c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0)));
        buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++;

        c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1)));
        buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++;

        c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0)));
        buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++;

        c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1)));
        buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++;
    }

    /* remaining positions, one at a time and without prefetching */
    for (; i >= 0; i -= 1)
    {
        c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1)));
        buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++;
    }

    /* c0 now holds T[0], which has not been counted yet */
    buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++;
}
= T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++;
    }

    /* The symbol left in c0 after the scan is always counted in sub-bucket 0. */
    c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++;
}

#endif

/*
 * Scans the block T[omp_block_start .. omp_block_start + omp_block_size - 1]
 * right to left, counting every position into one of four per-symbol
 * sub-buckets and collecting selected positions into SA.
 *
 * T               - 16-bit input text of length n.
 * SA              - output; gathered positions are written downwards starting
 *                   at SA[omp_block_start + omp_block_size - 1].
 * n               - total length of T (used only to look past the block end).
 * buckets         - 4 * ALPHABET_SIZE counters, zeroed here and indexed
 *                   through BUCKETS_INDEX4(symbol, 2 * f(p) + f(p - 1)).
 * omp_block_start - first position of the block.
 * omp_block_size  - number of positions in the block; 0 only zeroes buckets.
 *
 * Returns the number of positions gathered into SA.
 *
 * The flag f(p) is 1 when T[p] is greater than T[p + 1], or equal to it
 * while f(p + 1) is 1. A position p is gathered when f(p) == 0 and
 * f(p - 1) == 1.
 *
 * NOTE(review): in SA-IS terms f presumably marks L-type suffixes and the
 * gathered positions are the LMS suffixes (per the function name).
 * BUCKETS_INDEX4 and ALPHABET_SIZE are defined outside this view.
 */
static sa_sint_t libsais16_count_and_gather_lms_suffixes_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    memset(buckets, 0, (size_t)4 * ALPHABET_SIZE * sizeof(sa_sint_t));

    /* m is the next free SA slot; it moves down by one per gathered position. */
    fast_sint_t m = omp_block_start + omp_block_size - 1;

    if (omp_block_size > 0)
    {
        const fast_sint_t prefetch_distance = 256;

        /* Seed the flag of the block's last position: skip the run of symbols
           equal to T[m] to the right of the block and compare with the first
           different one. c1 stays -1 when the block ends at n. */
        fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;

        while (j < n && (c1 = T[j]) == c0) { ++j; }

        fast_uint_t f0 = c0 >= c1, f1 = 0;

        /* Unrolled by 4; c0/c1 and f0/f1 alternate roles. SA[m] is written
           unconditionally and m only moves when the position qualifies, so a
           non-qualifying write is overwritten by a later one (no branch). */
        for (i = m - 1, j = omp_block_start + 3; i >= j; i -= 4)
        {
            libsais16_prefetchr(&T[i - prefetch_distance]);

            c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0); buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++;
            c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 0); m -= (f0 & ~f1); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++;
            c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i - 1); m -= (f1 & ~f0); buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++;
            c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 2); m -= (f0 & ~f1); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++;
        }

        /* Scalar tail down to the first position of the block. */
        for (j -= 3; i >= j; i -= 1)
        {
            c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i + 1); m -= (f0 & ~f1); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++;
        }

        /* Classify the block's first position by peeking at the symbol just
           before the block (-1 when the block starts at the beginning of T). */
        c1 = (i >= 0) ?
T[i] : -1; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0); buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++;
    }

    /* m moved down once per gathered position, so this is the gathered count. */
    return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
}

/*
 * Driver for libsais16_count_and_gather_lms_suffixes_16u that optionally
 * splits T across OpenMP threads.
 *
 * T, SA, n     - forwarded to the per-block routine.
 * buckets      - 4 * ALPHABET_SIZE global counters (final totals on return).
 * threads      - requested thread count; the parallel region is only active
 *                when threads > 1, n >= 65536 and omp_get_dynamic() == 0.
 * thread_state - per-thread scratch (position, m, last_lms_suffix, buckets);
 *                unused without LIBSAIS_OPENMP.
 *
 * Returns the total number of gathered positions. After the merge they
 * occupy SA[n - m .. n - 1], and each thread_state[t].state.buckets holds the
 * summed counts of the threads numbered above t.
 */
static sa_sint_t libsais16_count_and_gather_lms_suffixes_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t m = 0;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && omp_get_dynamic() == 0)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Equal blocks rounded down to a multiple of 16; the last thread also
           takes the remainder. */
        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

        if (omp_num_threads == 1)
        {
            m = libsais16_count_and_gather_lms_suffixes_16u(T, SA, n, buckets, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                /* Each thread counts into its own bucket array and gathers
                   into the top of its own SA block. */
                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
                thread_state[omp_thread_num].state.m = libsais16_count_and_gather_lms_suffixes_16u(T, SA, n, thread_state[omp_thread_num].state.buckets, omp_block_start, omp_block_size);

                /* SA[position - 1] is the first slot the block routine fills,
                   i.e. the rightmost position gathered in this block. */
                if (thread_state[omp_thread_num].state.m > 0)
                {
                    thread_state[omp_thread_num].state.last_lms_suffix = SA[thread_state[omp_thread_num].state.position - 1];
                }
            }

            #pragma omp barrier

            #pragma omp master
            {
                memset(buckets, 0, (size_t)4 * ALPHABET_SIZE * sizeof(sa_sint_t));

                /* Merge from the last thread to the first so the gathered
                   positions end up contiguous at the end of SA. */
                fast_sint_t t;
                for (t = omp_num_threads - 1; t >= 0; --t)
                {
                    m += (sa_sint_t)thread_state[t].state.m;

                    /* The last thread's entries already sit at the end of SA. */
                    if (t != omp_num_threads - 1 && thread_state[t].state.m > 0)
                    {
                        memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.m], (size_t)thread_state[t].state.m * sizeof(sa_sint_t));
                    }

                    {
                        /* Add this thread's counts to the totals and leave in
                           its own array the running total of the threads
                           processed before it (those with a higher number). */
                        sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
                        fast_sint_t s; for (s = 0; s < 4 * ALPHABET_SIZE; s += 1) { sa_sint_t A = buckets[s], B = temp_bucket[s]; buckets[s] = A + B; temp_bucket[s] = A; }
                    }
                }
            }
        }
#endif
    }

    return m;
}

/*
 * 32-bit-symbol variant of the count-and-gather block scan: counts every
 * position of the block into one of four per-symbol sub-buckets and collects
 * selected positions into SA.
 *
 * T               - input of n values used as bucket indices.
 * SA              - output; gathered positions are written downwards starting
 *                   at SA[omp_block_start + omp_block_size - 1].
 * n               - total length of T.
 * k               - alphabet size; 4 * k counters are zeroed.
 * buckets         - counters indexed by BUCKETS_INDEX4(symbol, 2*f(p) + f(p-1)).
 * omp_block_start - first position of the block.
 * omp_block_size  - number of positions in the block.
 *
 * Returns the number of positions gathered. Flag and gather rules: f(p) is 1
 * when T[p] is greater than T[p + 1], or equal to it while f(p + 1) is 1; a
 * position p is gathered when f(p) == 0 and f(p - 1) == 1.
 *
 * NOTE(review): gathered positions are presumably the LMS suffixes of SA-IS.
 */
static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t));

    fast_sint_t m = omp_block_start + omp_block_size - 1;

    if (omp_block_size > 0)
    {
        const fast_sint_t prefetch_distance = 64;

        /* Seed the flag of the block's last position from the first differing
           symbol to its right (c1 stays -1 when the block ends at n). */
        fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;

        while (j < n && (c1 = T[j]) == c0) { ++j; }

        fast_uint_t f0 = c0 >= c1, f1 = 0;

        /* Unrolled by 4. The lower bound keeps the T reads used for bucket
           prefetching inside the block. */
        for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
        {
            libsais16_prefetchr(&T[i - 2 * prefetch_distance]);

            libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i -
prefetch_distance - 0], 0)]);
            libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]);
            libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]);
            libsais16_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]);

            /* Four scan steps; c0/c1 and f0/f1 alternate roles. SA[m] is
               always written, m only moves when the position qualifies. */
            c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0); buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++;
            c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 0); m -= (f0 & ~f1); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++;
            c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i - 1); m -= (f1 & ~f0); buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++;
            c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 2); m -= (f0 & ~f1); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++;
        }

        /* Scalar tail down to the first position of the block. */
        for (j -= prefetch_distance + 3; i >= j; i -= 1)
        {
            c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i + 1); m -= (f0 & ~f1); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++;
        }

        /* Classify the block's first position using the symbol just before
           the block (-1 when the block starts at the beginning of T). */
        c1 = (i >= 0) ? T[i] : -1; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0); buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++;
    }

    /* m moved down once per gathered position, so this is the gathered count. */
    return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
}

/*
 * Same block scan as the 4k variant, but with only two counters per symbol:
 * a position is counted in sub-bucket 1 when it is gathered and in
 * sub-bucket 0 otherwise.
 *
 * T               - input of n values used as bucket indices.
 * SA              - output; gathered positions are written downwards starting
 *                   at SA[omp_block_start + omp_block_size - 1].
 * n               - total length of T.
 * k               - alphabet size; 2 * k counters are zeroed.
 * buckets         - counters indexed by BUCKETS_INDEX2(symbol, gathered ? 1 : 0).
 * omp_block_start - first position of the block.
 * omp_block_size  - number of positions in the block.
 *
 * Returns the number of positions gathered. f(p) is 1 when T[p] is greater
 * than T[p + 1], or equal to it while f(p + 1) is 1; p is gathered when
 * f(p) == 0 and f(p - 1) == 1.
 *
 * NOTE(review): gathered positions are presumably the LMS suffixes of SA-IS.
 */
static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));

    fast_sint_t m = omp_block_start + omp_block_size - 1;

    if (omp_block_size > 0)
    {
        const fast_sint_t prefetch_distance = 64;

        /* Seed the flag of the block's last position from the first differing
           symbol to its right (c1 stays -1 when the block ends at n). */
        fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;

        while (j < n && (c1 = T[j]) == c0) { ++j; }

        fast_uint_t f0 = c0 >= c1, f1 = 0;

        for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
        {
            libsais16_prefetchr(&T[i - 2 * prefetch_distance]);

            libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]);
            libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]);
            libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]);
            libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]);

            /* The same (f_new & ~f_old) value drives both the SA cursor and
               the sub-bucket choice. */
            c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0); buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++;
            c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 0); m -= (f0 & ~f1); buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++;
            c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i - 1); m -= (f1 & ~f0); buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++;
            c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 2); m -= (f0 & ~f1); buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++;
        }

        /* Scalar tail down to the first position of the block. */
        for (j -= prefetch_distance + 3; i >= j; i -= 1)
        {
            c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 -
(fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i + 1); m -= (f0 & ~f1); buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++;
        }

        /* Classify the block's first position using the symbol just before
           the block (-1 when the block starts at the beginning of T). */
        c1 = (i >= 0) ? T[i] : -1; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0); buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++;
    }

    /* m moved down once per gathered position, so this is the gathered count. */
    return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
}

/*
 * Count-and-gather block scan for input whose values may be negative
 * (every value is masked with SAINT_MAX before being used as a bucket
 * index).
 *
 * T               - input of n values; comparisons use the raw values.
 * SA              - output; gathered positions are written downwards starting
 *                   at SA[omp_block_start + omp_block_size - 1].
 * n               - total length of T.
 * k               - alphabet size; 2 * k counters are zeroed.
 * buckets         - counters indexed by BUCKETS_INDEX2(masked symbol, s).
 * omp_block_start - first position of the block.
 * omp_block_size  - number of positions in the block.
 *
 * Returns the number of positions gathered.
 *
 * Counting follows the plain 2k rule: sub-bucket 1 when f(p) == 0 and
 * f(p - 1) == 1, sub-bucket 0 otherwise. Gathering has one extra condition:
 * the raw value at p must be non-negative. Positions whose raw value is
 * negative are therefore counted but not collected.
 *
 * NOTE(review): the negative values presumably mark entries already handled
 * by a "compaction" step - confirm against the callers. SAINT_MAX is defined
 * outside this view.
 */
static sa_sint_t libsais16_count_and_gather_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));

    fast_sint_t m = omp_block_start + omp_block_size - 1;

    if (omp_block_size > 0)
    {
        const fast_sint_t prefetch_distance = 64;

        /* Seed the flag of the block's last position from the first differing
           raw value to its right (c1 stays -1 when the block ends at n). */
        fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;

        while (j < n && (c1 = T[j]) == c0) { ++j; }

        fast_uint_t f0 = c0 >= c1, f1 = 0;

        for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
        {
            libsais16_prefetchr(&T[i - 2 * prefetch_distance]);

            libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]);
            libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]);
            libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]);
            libsais16_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]);

            /* Order matters within each step: compare raw values, test the
               sign for gathering, then mask and count. */
            c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0 & (c0 >= 0)); c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++;
            c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 0); m -= (f0 & ~f1 & (c1 >= 0)); c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++;
            c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i - 1); m -= (f1 & ~f0 & (c0 >= 0)); c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 &
~f0))]++;
            c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 2); m -= (f0 & ~f1 & (c1 >= 0)); c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++;
        }

        /* Scalar tail down to the first position of the block. */
        for (j -= prefetch_distance + 3; i >= j; i -= 1)
        {
            c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i + 1); m -= (f0 & ~f1 & (c1 >= 0)); c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++;
        }

        /* Classify the block's first position using the value just before
           the block (-1 when the block starts at the beginning of T). */
        c1 = (i >= 0) ? T[i] : -1; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0 & (c0 >= 0)); c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++;
    }

    /* m moved down once per gathered position, so this is the gathered count. */
    return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
}

#if defined(LIBSAIS_OPENMP)

/*
 * Picks the distance (in elements) between consecutive per-thread bucket
 * arrays.
 *
 * free_space  - number of elements available for the extra arrays.
 * bucket_size - number of elements in one bucket array.
 * num_buckets - number of bucket arrays; num_buckets - 1 is used as a
 *               divisor, so it must be greater than 1 (the callers visible
 *               here only call this from their multi-thread branch).
 *
 * Returns bucket_size rounded up to a multiple of 1024 if that fits into
 * free_space / (num_buckets - 1), else rounded up to a multiple of 16 if
 * that fits, else bucket_size unchanged.
 */
static fast_sint_t libsais16_get_bucket_stride(fast_sint_t free_space, fast_sint_t bucket_size, fast_sint_t num_buckets)
{
    fast_sint_t bucket_size_1024 = (bucket_size + 1023) & (-1024); if (free_space / (num_buckets - 1) >= bucket_size_1024) { return bucket_size_1024; }
    fast_sint_t bucket_size_16 = (bucket_size + 15) & (-16); if (free_space / (num_buckets - 1) >= bucket_size_16) { return bucket_size_16; }

    return bucket_size;
}

/*
 * Multi-threaded count-and-gather (4 counters per symbol) that places the
 * extra per-thread bucket arrays in free space below `buckets`.
 *
 * T, SA, n, k   - forwarded to libsais16_count_and_gather_lms_suffixes_32s_4k.
 * buckets       - 4 * k global counters; thread t counts into
 *                 buckets - t * bucket_stride.
 * local_buckets - non-zero: the free space is LIBSAIS_LOCAL_BUFFER_SIZE;
 *                 zero: it is the gap between &SA[n] and buckets.
 * threads       - requested thread count (parallel only when threads > 1 and
 *                 n >= 65536).
 * thread_state  - per-thread scratch (position, count).
 *
 * Returns the total number of gathered positions. The value is only
 * accumulated by the single-thread path or by the last thread of the team.
 */
static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_4k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t m = 0;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(local_buckets); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Equal blocks rounded down to a multiple of 16; the last thread also
           takes the remainder. */
        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

        if (omp_num_threads == 1)
        {
            m = libsais16_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            fast_sint_t bucket_size = 4 * (fast_sint_t)k;
            fast_sint_t free_space = local_buckets ? LIBSAIS_LOCAL_BUFFER_SIZE : (buckets - &SA[n]);
            fast_sint_t bucket_stride = libsais16_get_bucket_stride(free_space, bucket_size, omp_num_threads);

            {
                /* Thread 0 counts directly into buckets; thread t counts into
                   the array t strides below it. */
                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
                thread_state[omp_thread_num].state.count = libsais16_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* After the barrier the work is split: the last thread moves the
               gathered positions to the end of SA, all other threads combine
               the bucket arrays. */
            if (omp_thread_num == omp_num_threads - 1)
            {
                fast_sint_t t;
                for (t = omp_num_threads - 1; t >= 0; --t)
                {
                    m += (sa_sint_t)thread_state[t].state.count;

                    /* The last thread's entries already sit at the end of SA. */
                    if (t != omp_num_threads - 1 && thread_state[t].state.count > 0)
                    {
                        memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.count], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
                    }
                }
            }
            else
            {
                /* Re-partition the bucket index range among the remaining
                   omp_num_threads - 1 threads. */
                omp_num_threads = omp_num_threads - 1;
                omp_block_stride = (bucket_size / omp_num_threads) & (-16);
                omp_block_start = omp_thread_num * omp_block_stride;
                omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : bucket_size - omp_block_start;

                /* omp_num_threads was decremented above, so "+ 1" passes the
                   original team size. NOTE(review): the helper is defined
                   outside this view; presumably it sums that many arrays
                   spaced bucket_stride apart into buckets. */
                libsais16_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads + 1);
            }
        }
#endif
    }

    return m;
}

/*
 * Multi-threaded count-and-gather (2 counters per symbol) that places the
 * extra per-thread bucket arrays in free space below `buckets`.
 *
 * T, SA, n, k   - forwarded to libsais16_count_and_gather_lms_suffixes_32s_2k.
 * buckets       - 2 * k global counters; thread t counts into
 *                 buckets - t * bucket_stride.
 * local_buckets - non-zero: the free space is LIBSAIS_LOCAL_BUFFER_SIZE;
 *                 zero: it is the gap between &SA[n] and buckets.
 * threads       - requested thread count (parallel only when threads > 1 and
 *                 n >= 65536).
 * thread_state  - per-thread scratch (position, count).
 *
 * Returns the total number of gathered positions. The value is only
 * accumulated by the single-thread path or by the last thread of the team.
 */
static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_2k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t m = 0;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(local_buckets); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Equal blocks rounded down to a multiple of 16; the last thread also
           takes the remainder. */
        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

        if (omp_num_threads == 1)
        {
            m = libsais16_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            fast_sint_t bucket_size = 2 * (fast_sint_t)k;
            fast_sint_t free_space = local_buckets ? LIBSAIS_LOCAL_BUFFER_SIZE : (buckets - &SA[n]);
            fast_sint_t bucket_stride = libsais16_get_bucket_stride(free_space, bucket_size, omp_num_threads);

            {
                /* Thread 0 counts directly into buckets; thread t counts into
                   the array t strides below it. */
                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
                thread_state[omp_thread_num].state.count = libsais16_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* The last thread moves the gathered positions to the end of SA
               while all other threads combine the bucket arrays. */
            if (omp_thread_num == omp_num_threads - 1)
            {
                fast_sint_t t;
                for (t = omp_num_threads - 1; t >= 0; --t)
                {
                    m += (sa_sint_t)thread_state[t].state.count;

                    /* The last thread's entries already sit at the end of SA. */
                    if (t != omp_num_threads - 1 && thread_state[t].state.count > 0)
                    {
                        memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.count], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
                    }
                }
            }
            else
            {
                /* Re-partition the bucket index range among the remaining
                   omp_num_threads - 1 threads. */
                omp_num_threads = omp_num_threads - 1;
                omp_block_stride = (bucket_size / omp_num_threads) & (-16);
                omp_block_start = omp_thread_num * omp_block_stride;
                omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : bucket_size - omp_block_start;

                /* omp_num_threads was decremented above, so "+ 1" passes the
                   original team size. NOTE(review): the helper is defined
                   outside this view; presumably it sums that many arrays
                   spaced bucket_stride apart into buckets. */
                libsais16_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads + 1);
            }
        }
#endif
    }

    return m;
}

/*
 * Multi-threaded version of the compacted count-and-gather (2 counters per
 * symbol). Unlike the non-compacted variants it returns nothing, gathers
 * into the second half of SA (SA + n) and lets every thread copy its own
 * piece into the first half.
 *
 * T, n, k       - forwarded to the per-block compacted routine.
 * SA            - output; the multi-thread path uses SA[n .. 2n - 1] as
 *                 staging, so SA presumably has at least 2 * n elements
 *                 there - TODO confirm against the caller.
 * buckets       - 2 * k global counters; thread t counts into
 *                 buckets - t * bucket_stride.
 * local_buckets - non-zero: the free space is LIBSAIS_LOCAL_BUFFER_SIZE;
 *                 zero: it is the gap between &SA[2 * n] and buckets.
 * threads       - requested thread count (parallel only when threads > 1 and
 *                 n >= 65536).
 * thread_state  - per-thread scratch (position, count).
 */
static void libsais16_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(local_buckets); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Equal blocks rounded down to a multiple of 16; the last thread also
           takes the remainder. */
        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

        if (omp_num_threads == 1)
        {
            /* Single-thread path gathers straight into SA (not SA + n). */
            libsais16_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            fast_sint_t bucket_size = 2 * (fast_sint_t)k;
            fast_sint_t free_space = local_buckets ? LIBSAIS_LOCAL_BUFFER_SIZE : (buckets - &SA[(fast_sint_t)n + (fast_sint_t)n]);
            fast_sint_t bucket_stride = libsais16_get_bucket_stride(free_space, bucket_size, omp_num_threads);

            {
                /* Gather into the staging half SA + n, count into this
                   thread's own bucket array. */
                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
                thread_state[omp_thread_num].state.count = libsais16_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA + n, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            {
                /* m = entries gathered by this thread and all higher-numbered
                   ones, which fixes where this thread's piece must land. */
                fast_sint_t t, m = 0; for (t = omp_num_threads - 1; t >= omp_thread_num; --t) { m += (sa_sint_t)thread_state[t].state.count; }

                if (thread_state[omp_thread_num].state.count > 0)
                {
                    memcpy(&SA[n - m], &SA[n + thread_state[omp_thread_num].state.position - thread_state[omp_thread_num].state.count], (size_t)thread_state[omp_thread_num].state.count * sizeof(sa_sint_t));
                }
            }

            {
                /* Every thread takes a slice of the bucket index range. */
                omp_block_stride = (bucket_size / omp_num_threads) & (-16);
                omp_block_start = omp_thread_num * omp_block_stride;
                omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : bucket_size - omp_block_start;

                /* NOTE(review): the helper is defined outside this view;
                   presumably it sums omp_num_threads arrays spaced
                   bucket_stride apart into buckets. */
                libsais16_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads);
            }
        }
#endif
    }
}

#endif

/*
 * Count-and-gather (4 counters per symbol) for the case where no free space
 * is used for per-thread bucket arrays: at most two threads are used, one
 * counting and one gathering.
 *
 * T, SA, n, k, buckets - as for libsais16_count_and_gather_lms_suffixes_32s_4k.
 * threads              - parallel only when threads > 1 and n >= 65536.
 *
 * Returns the number of gathered positions.
 */
static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_4k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads)
{
    sa_sint_t m = 0;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads);

        fast_sint_t omp_num_threads = 1;
#endif
        if (omp_num_threads == 1)
        {
            /* One pass over the whole input does both jobs. */
            m = libsais16_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, 0, n);
        }
#if defined(LIBSAIS_OPENMP)
        else if (omp_thread_num == 0)
        {
            /* Thread 0 only counts ... */
            libsais16_count_lms_suffixes_32s_4k(T, n, k, buckets);
        }
        else
        {
            /* ... while the other thread only gathers and sets m. */
            m = libsais16_gather_lms_suffixes_32s(T, SA, n);
        }
#endif
    }

    return m;
}

/*
 * Count-and-gather (2 counters per symbol) for the case where no free space
 * is used for per-thread bucket arrays: at most two threads are used, one
 * counting and one gathering.
 *
 * T, SA, n, k, buckets - as for libsais16_count_and_gather_lms_suffixes_32s_2k.
 * threads              - parallel only when threads > 1 and n >= 65536.
 *
 * Returns the number of gathered positions.
 */
static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads)
{
    sa_sint_t m = 0;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads);

        fast_sint_t omp_num_threads = 1;
#endif
        if (omp_num_threads == 1)
        {
            /* One pass over the whole input does both jobs. */
            m = libsais16_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
        }
#if defined(LIBSAIS_OPENMP)
        else if (omp_thread_num == 0)
        {
            /* Thread 0 only counts ... */
            libsais16_count_lms_suffixes_32s_2k(T, n, k, buckets);
        }
        else
        {
            /* ... while the other thread only gathers and sets m. */
            m = libsais16_gather_lms_suffixes_32s(T, SA, n);
        }
#endif
    }

    return m;
}

/*
 * Compacted-input counterpart of the two-thread "nofs" count-and-gather
 * (2 counters per symbol): one thread counts, the other gathers.
 * Returns the number of gathered positions.
 */
static sa_sint_t libsais16_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
sa_sint_t * RESTRICT buckets, sa_sint_t threads)
{
    sa_sint_t m = 0;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads);

        fast_sint_t omp_num_threads = 1;
#endif
        if (omp_num_threads == 1)
        {
            /* One pass over the whole input counts and gathers. */
            m = libsais16_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
        }
#if defined(LIBSAIS_OPENMP)
        else if (omp_thread_num == 0)
        {
            /* Thread 0 only counts ... */
            libsais16_count_compacted_lms_suffixes_32s_2k(T, n, k, buckets);
        }
        else
        {
            /* ... while the other thread only gathers and sets m. */
            m = libsais16_gather_compacted_lms_suffixes_32s(T, SA, n);
        }
#endif
    }

    return m;
}

/*
 * Entry point for count-and-gather with 4 counters per symbol: chooses
 * between the free-space multi-threaded implementation and the two-thread
 * "nofs" fallback.
 *
 * T, SA, n, k, buckets - forwarded unchanged.
 * local_buckets        - non-zero: free space is LIBSAIS_LOCAL_BUFFER_SIZE;
 *                        zero: it is the gap between &SA[n] and buckets.
 * threads              - upper bound on the number of threads.
 * thread_state         - per-thread scratch for the free-space path.
 *
 * Returns the number of gathered positions.
 *
 * NOTE(review): n / k is evaluated whenever max_threads > 1 and n >= 65536,
 * so k is assumed to be non-zero - confirm against the callers.
 */
static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t m;

#if defined(LIBSAIS_OPENMP)
    /* How many bucket arrays (4 * k rounded up to a multiple of 16) fit into
       the free space, capped by the requested thread count. */
    fast_sint_t free_space = local_buckets ? LIBSAIS_LOCAL_BUFFER_SIZE : (buckets - &SA[n]);
    sa_sint_t max_threads = (sa_sint_t)(free_space / ((4 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; }

    if (max_threads > 1 && n >= 65536 && n / k >= 2)
    {
        /* Further cap the team at n / (16 * k), but never go below 2 threads
           once this path has been chosen. */
        if (max_threads > n / 16 / k) { max_threads = n / 16 / k; }
        m = libsais16_count_and_gather_lms_suffixes_32s_4k_fs_omp(T, SA, n, k, buckets, local_buckets, max_threads > 2 ?
max_threads : 2, thread_state);
    }
    else
#else
    UNUSED(local_buckets); UNUSED(thread_state);
#endif
    {
        /* With OpenMP this block is the else-branch; without it, it is the
           only path. */
        m = libsais16_count_and_gather_lms_suffixes_32s_4k_nofs_omp(T, SA, n, k, buckets, threads);
    }

    return m;
}

/*
 * Entry point for count-and-gather with 2 counters per symbol: chooses
 * between the free-space multi-threaded implementation and the two-thread
 * "nofs" fallback.
 *
 * T, SA, n, k, buckets - forwarded unchanged.
 * local_buckets        - non-zero: free space is LIBSAIS_LOCAL_BUFFER_SIZE;
 *                        zero: it is the gap between &SA[n] and buckets.
 * threads              - upper bound on the number of threads.
 * thread_state         - per-thread scratch for the free-space path.
 *
 * Returns the number of gathered positions.
 *
 * NOTE(review): n / k is evaluated whenever max_threads > 1 and n >= 65536,
 * so k is assumed to be non-zero - confirm against the callers.
 */
static sa_sint_t libsais16_count_and_gather_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t m;

#if defined(LIBSAIS_OPENMP)
    /* How many bucket arrays (2 * k rounded up to a multiple of 16) fit into
       the free space, capped by the requested thread count. */
    fast_sint_t free_space = local_buckets ? LIBSAIS_LOCAL_BUFFER_SIZE : (buckets - &SA[n]);
    sa_sint_t max_threads = (sa_sint_t)(free_space / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; }

    if (max_threads > 1 && n >= 65536 && n / k >= 2)
    {
        /* Further cap the team at n / (8 * k), but never go below 2 threads
           once this path has been chosen. */
        if (max_threads > n / 8 / k) { max_threads = n / 8 / k; }
        m = libsais16_count_and_gather_lms_suffixes_32s_2k_fs_omp(T, SA, n, k, buckets, local_buckets, max_threads > 2 ? max_threads : 2, thread_state);
    }
    else
#else
    UNUSED(local_buckets); UNUSED(thread_state);
#endif
    {
        /* With OpenMP this block is the else-branch; without it, it is the
           only path. */
        m = libsais16_count_and_gather_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads);
    }

    return m;
}

/*
 * Entry point for the compacted count-and-gather (2 counters per symbol).
 * Returns nothing; the count returned by the fallback path is discarded.
 *
 * T, SA, n, k, buckets - forwarded unchanged.
 * local_buckets        - when non-zero the free-space path is never taken
 *                        (see the !local_buckets test below).
 * threads              - upper bound on the number of threads.
 * thread_state         - per-thread scratch for the free-space path.
 *
 * The free space is measured from &SA[2 * n] rather than &SA[n].
 */
static void libsais16_count_and_gather_compacted_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    fast_sint_t free_space = local_buckets ? LIBSAIS_LOCAL_BUFFER_SIZE : (buckets - &SA[(fast_sint_t)n + (fast_sint_t)n]);
    sa_sint_t max_threads = (sa_sint_t)(free_space / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; }

    if (!local_buckets && max_threads > 1 && n >= 65536 && n / k >= 2)
    {
        /* Cap the team at n / (8 * k), but never go below 2 threads once
           this path has been chosen. */
        if (max_threads > n / 8 / k) { max_threads = n / 8 / k; }
        libsais16_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(T, SA, n, k, buckets, local_buckets, max_threads > 2 ?
max_threads : 2, thread_state);
    }
    else
#else
    UNUSED(local_buckets); UNUSED(thread_state);
#endif
    {
        /* With OpenMP this block is the else-branch; without it, it is the
           only path. */
        libsais16_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads);
    }
}

/*
 * Builds a plain histogram of T.
 *
 * T       - input of n values, each used directly as an index into buckets.
 * n       - number of values in T.
 * k       - number of counters; buckets[0 .. k - 1] is zeroed first.
 * buckets - output: buckets[c] is the number of occurrences of c in T.
 */
static void libsais16_count_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets)
{
    const fast_sint_t prefetch_distance = 64;

    memset(buckets, 0, (size_t)k * sizeof(sa_sint_t));

    /* Main loop unrolled by 8; it runs while a full group of 8 remains. */
    fast_sint_t i, j;
    for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8)
    {
        libsais16_prefetchr(&T[i + prefetch_distance]);

        buckets[T[i + 0]]++;
        buckets[T[i + 1]]++;
        buckets[T[i + 2]]++;
        buckets[T[i + 3]]++;
        buckets[T[i + 4]]++;
        buckets[T[i + 5]]++;
        buckets[T[i + 6]]++;
        buckets[T[i + 7]]++;
    }

    /* Remaining (fewer than 8) values. */
    for (j += 7; i < j; i += 1)
    {
        buckets[T[i]]++;
    }
}

/*
 * Turns the 4-way per-symbol counters into bucket start/end offsets.
 *
 * buckets - in: counters addressed through BUCKETS_INDEX4(symbol, s);
 *           out: start offsets at buckets[6 * ALPHABET_SIZE + symbol] and
 *           end offsets at buckets[7 * ALPHABET_SIZE + symbol].
 * freq    - optional; when non-NULL, freq[symbol] receives the symbol's
 *           total count.
 *
 * Returns (index of the highest symbol with a non-zero count) + 1, or 0 if
 * all counts are zero.
 */
static sa_sint_t libsais16_initialize_buckets_start_and_end_16u(sa_sint_t * RESTRICT buckets, sa_sint_t * RESTRICT freq)
{
    sa_sint_t * RESTRICT bucket_start = &buckets[6 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE];

    fast_sint_t k = -1;

    /* The two branches are identical except for the freq[j] store; the NULL
       test is kept out of the loop. */
    if (freq != NULL)
    {
        fast_sint_t i, j; sa_sint_t sum = 0;
        for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1)
        {
            sa_sint_t total = buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)];

            bucket_start[j] = sum; sum += total; bucket_end[j] = sum; k = total > 0 ? j : k; freq[j] = total;
        }
    }
    else
    {
        fast_sint_t i, j; sa_sint_t sum = 0;
        for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1)
        {
            sa_sint_t total = buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)];

            bucket_start[j] = sum; sum += total; bucket_end[j] = sum; k = total > 0 ?
j : k;
        }
    }

    /* k is the highest symbol seen with a non-zero count, or -1 if none. */
    return (sa_sint_t)(k + 1);
}

/*
 * Turns 4-way per-symbol counters into bucket start/end offsets for a
 * k-symbol alphabet.
 *
 * k       - alphabet size.
 * buckets - in: counters addressed through BUCKETS_INDEX4(symbol, s);
 *           out: start offsets at buckets[4 * k + symbol] and end offsets at
 *           buckets[5 * k + symbol].
 */
static void libsais16_initialize_buckets_start_and_end_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
{
    sa_sint_t * RESTRICT bucket_start = &buckets[4 * (fast_sint_t)k];
    sa_sint_t * RESTRICT bucket_end = &buckets[5 * (fast_sint_t)k];

    fast_sint_t i, j; sa_sint_t sum = 0;
    for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1)
    {
        bucket_start[j] = sum;
        sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)];
        bucket_end[j] = sum;
    }
}

/*
 * Turns 2-way per-symbol counters into bucket start/end offsets for a
 * k-symbol alphabet.
 *
 * k       - alphabet size.
 * buckets - in: counters addressed through BUCKETS_INDEX2(symbol, s);
 *           out: start offsets at buckets[2 * k + symbol] and end offsets at
 *           buckets[3 * k + symbol].
 */
static void libsais16_initialize_buckets_start_and_end_32s_4k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
{
    sa_sint_t * RESTRICT bucket_start = &buckets[2 * (fast_sint_t)k];
    sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k];

    fast_sint_t i, j; sa_sint_t sum = 0;
    for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1)
    {
        bucket_start[j] = sum;
        sum += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
        bucket_end[j] = sum;
    }
}

/*
 * In place: replaces sub-counter 0 of every symbol with the running total of
 * both sub-counters up to and including that symbol (the bucket end offset).
 * Sub-counter 1 is left unchanged.
 *
 * k       - alphabet size.
 * buckets - counters addressed through BUCKETS_INDEX2(symbol, s).
 */
static void libsais16_initialize_buckets_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
{
    fast_sint_t i; sa_sint_t sum0 = 0;
    for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0))
    {
        sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; buckets[i + BUCKETS_INDEX2(0, 0)] = sum0;
    }
}

/*
 * Repacks the 2k bucket array into two dense k-element arrays.
 *
 * k       - alphabet size; (k - 1) is used as an unsigned copy length, so k
 *           must be at least 1.
 * buckets - in: per-symbol values at BUCKETS_INDEX2(symbol, 0);
 *           out: buckets[0 .. k - 1] holds those values densely, and
 *           buckets[k .. 2k - 1] holds the same sequence shifted right by
 *           one with a leading 0.
 *
 * NOTE(review): assumes the input values are bucket end offsets (as written
 * by libsais16_initialize_buckets_end_32s_2k), which makes the shifted copy
 * the bucket start offsets - confirm against the caller.
 */
static void libsais16_initialize_buckets_start_and_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
{
    fast_sint_t i, j;
    for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1)
    {
        buckets[j] = buckets[i];
    }

    buckets[k] = 0; memcpy(&buckets[k + 1], buckets, ((size_t)k - 1) * sizeof(sa_sint_t));
}

/*
 * In place: converts k counts into bucket start offsets (exclusive prefix
 * sum).
 *
 * k       - number of counters.
 * buckets - in: counts; out: buckets[c] = sum of the counts below c.
 */
static void libsais16_initialize_buckets_start_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
{
    fast_sint_t i; sa_sint_t sum
= 0;
    for (i = 0; i <= (fast_sint_t)k - 1; i += 1) { sa_sint_t tmp = buckets[i]; buckets[i] = sum; sum += tmp; }
}

/*
 * In place: converts k counts into bucket end offsets (inclusive prefix
 * sum).
 *
 * k       - number of counters.
 * buckets - in: counts; out: buckets[c] = sum of the counts up to and
 *           including c.
 */
static void libsais16_initialize_buckets_end_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
{
    fast_sint_t i; sa_sint_t sum = 0;
    for (i = 0; i <= (fast_sint_t)k - 1; i += 1) { sum += buckets[i]; buckets[i] = sum; }
}

/*
 * Prepares the per-symbol cursors used by the 16-bit radix sort.
 *
 * T                - 16-bit input text.
 * buckets          - in/out: 4-way counters addressed through
 *                    BUCKETS_INDEX4(symbol, s); the cursor table is written
 *                    at buckets[4 * ALPHABET_SIZE ...], addressed through
 *                    BUCKETS_INDEX2(symbol, s).
 * first_lms_suffix - position from which the walk towards T[0] starts.
 *
 * Returns the total of sub-counters 1 and 3 over all symbols after the
 * adjustment below.
 *
 * Step 1 walks from first_lms_suffix down to position 0, recomputing the
 * flags exactly as the counting pass does but starting from f = 0, and
 * decrements the counter each visited position maps to.
 * Step 2 writes, per symbol, the running total of sub-counters 1 and 3:
 * the value before the symbol goes to slot (symbol, 1) and the value after
 * it to slot (symbol, 0).
 *
 * NOTE(review): step 1 presumably removes the text prefix up to the first
 * LMS suffix from the statistics - confirm against the caller.
 */
static sa_sint_t libsais16_initialize_buckets_for_lms_suffixes_radix_sort_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
{
    {
        fast_uint_t f0 = 0;
        fast_uint_t f1 = 0;
        fast_sint_t c0 = T[first_lms_suffix];
        fast_sint_t c1 = 0;

        for (; --first_lms_suffix >= 0; )
        {
            c1 = c0; c0 = T[first_lms_suffix]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]--;
        }

        /* Position 0 has no left neighbour: its low flag bit is 0. */
        buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0)]--;
    }

    {
        sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];

        fast_sint_t i, j; sa_sint_t sum = 0;
        for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0))
        {
            temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum; sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum;
        }

        return sum;
    }
}

/*
 * Prepares the 2k bucket array for the 32-bit radix sort.
 *
 * T                - input values used as bucket indices.
 * k                - alphabet size.
 * buckets          - in: 2-way counters addressed through
 *                    BUCKETS_INDEX2(symbol, s); out: slot (symbol, 0) holds
 *                    the running total of both sub-counters and slot
 *                    (symbol, 1) the running total of sub-counter 1, both
 *                    up to and including the symbol.
 * first_lms_suffix - position whose symbol is moved from sub-counter 1 to
 *                    sub-counter 0 before the sums are taken.
 */
static void libsais16_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
{
    /* Re-file the entry for T[first_lms_suffix]: one more in sub-counter 0,
       one less in sub-counter 1 (the combined total is unchanged). */
    buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++;
    buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--;

    fast_sint_t i; sa_sint_t sum0 = 0, sum1 = 0;
    for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0))
    {
        sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
        sum1 += buckets[i + BUCKETS_INDEX2(0, 1)];

        buckets[i + BUCKETS_INDEX2(0, 0)] = sum0;
        buckets[i + BUCKETS_INDEX2(0, 1)] = sum1;
    }
}

/*
 * Prepares the per-symbol cursors used by the 32-bit (6k layout) radix sort.
 * Parameters: T (input values), k (alphabet size), buckets (4-way counters
 * in, cursor table out at buckets[4 * k ...]), first_lms_suffix (start of
 * the walk towards T[0]). Returns the total of sub-counters 1 and 3.
 */
static sa_sint_t libsais16_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(const
sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
{
    {
        /* Walk from first_lms_suffix down to position 0, recomputing the
           flags (seeded with f = 0) and decrementing the counter each
           visited position maps to. */
        fast_uint_t f0 = 0;
        fast_uint_t f1 = 0;
        fast_sint_t c0 = T[first_lms_suffix];
        fast_sint_t c1 = 0;

        for (; --first_lms_suffix >= 0; )
        {
            c1 = c0; c0 = T[first_lms_suffix]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]--;
        }

        /* Position 0 has no left neighbour: its low flag bit is 0. */
        buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0)]--;
    }

    {
        /* temp_bucket[symbol] = running total of sub-counters 1 and 3 up to
           and including the symbol. */
        sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k];

        fast_sint_t i, j; sa_sint_t sum = 0;
        for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1)
        {
            sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum;
        }

        return sum;
    }
}

/*
 * Prepares the 4k bucket layout: bucket start/end offsets plus a running
 * total of sub-counter 1.
 *
 * T                - input values used as bucket indices.
 * k                - alphabet size.
 * buckets          - in: 2-way counters addressed through
 *                    BUCKETS_INDEX2(symbol, s); out: slot (symbol, 1) holds
 *                    the running total of sub-counter 1 up to and including
 *                    the symbol, buckets[2 * k + symbol] the bucket start
 *                    and buckets[3 * k + symbol] the bucket end. Slot
 *                    (symbol, 0) keeps its (adjusted) count.
 * first_lms_suffix - position whose symbol is moved from sub-counter 1 to
 *                    sub-counter 0 before the sums are taken.
 */
static void libsais16_initialize_buckets_for_radix_and_partial_sorting_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
{
    sa_sint_t * RESTRICT bucket_start = &buckets[2 * (fast_sint_t)k];
    sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k];

    /* Re-file the entry for T[first_lms_suffix]; the combined total of the
       symbol is unchanged. */
    buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++;
    buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--;

    /* sum1 accumulates both sub-counters (bucket boundaries); sum0
       accumulates sub-counter 1 only. Both sums read sub-counter 1 before it
       is overwritten with sum0. */
    fast_sint_t i, j; sa_sint_t sum0 = 0, sum1 = 0;
    for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1)
    {
        bucket_start[j] = sum1;

        sum0 += buckets[i + BUCKETS_INDEX2(0, 1)];
        sum1 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
        buckets[i + BUCKETS_INDEX2(0, 1)] = sum0;

        bucket_end[j] = sum1;
    }
}

/*
 * Distributes the entries SA[omp_block_start .. omp_block_start +
 * omp_block_size - 1] into buckets keyed by the symbol at the position each
 * entry holds.
 *
 * T                - 16-bit input text.
 * SA               - both source and destination of the entries.
 * induction_bucket - per-symbol cursors addressed through
 *                    BUCKETS_INDEX2(symbol, 0); each cursor is
 *                    pre-decremented before the store.
 * omp_block_start  - first SA index to process.
 * omp_block_size   - number of SA entries to process.
 *
 * The range is processed from its highest index to its lowest.
 *
 * NOTE(review): source and destination share SA, so the destinations are
 * assumed never to land on source entries that have not been read yet -
 * confirm against how the cursors are initialised.
 */
static void libsais16_radix_sort_lms_suffixes_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    /* Unrolled by 4; the lower bound keeps the SA reads used for prefetching
       T inside the block. */
    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
    {
        libsais16_prefetchr(&SA[i - 2 *
prefetch_distance]);

        libsais16_prefetchr(&T[SA[i - prefetch_distance - 0]]);
        libsais16_prefetchr(&T[SA[i - prefetch_distance - 1]]);
        libsais16_prefetchr(&T[SA[i - prefetch_distance - 2]]);
        libsais16_prefetchr(&T[SA[i - prefetch_distance - 3]]);

        /* Read the entry, then store it at the pre-decremented cursor of the
           symbol found at that position. */
        sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0;
        sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1;
        sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2;
        sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3;
    }

    /* Scalar tail down to the first entry of the block. */
    for (j -= prefetch_distance + 3; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p;
    }
}

/*
 * Driver for libsais16_radix_sort_lms_suffixes_16u that optionally splits the
 * work across OpenMP threads.
 *
 * T, SA        - forwarded to the per-block routine.
 * n            - length of T; the entries to sort sit at the end of SA[0..n).
 * m            - number of entries; the single-thread path processes m - 1
 *                of them, starting at SA[n - m + 1].
 * flags        - when LIBSAIS_FLAGS_GSA is set, the first cursor
 *                (buckets[4 * ALPHABET_SIZE]) is decremented beforehand.
 * buckets      - the cursor table lives at buckets[4 * ALPHABET_SIZE ...].
 * threads      - parallel only when threads > 1, n >= 65536, m >= 65536 and
 *                omp_get_dynamic() == 0.
 * thread_state - per-thread scratch; state.m and state.buckets are read.
 *
 * NOTE(review): the multi-thread path relies on thread_state[].state.m and
 * thread_state[].state.buckets still holding what the 16u count-and-gather
 * driver left there - confirm the call order. The GSA decrement presumably
 * reserves a slot for a separator symbol - confirm.
 */
static void libsais16_radix_sort_lms_suffixes_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t flags, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    if (flags & LIBSAIS_FLAGS_GSA) { buckets[4 * ALPHABET_SIZE]--; }

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && m >= 65536 && omp_get_dynamic() == 0)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_num_threads = 1;
#endif
        if (omp_num_threads == 1)
        {
            /* The entry at SA[n - m] is not processed. */
            libsais16_radix_sort_lms_suffixes_16u(T, SA, &buckets[4 * ALPHABET_SIZE], (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                /* Build this thread's private cursor table in place: global
                   cursor minus the value stored in this thread's bucket array
                   at BUCKETS_INDEX4(symbol, 1). */
                sa_sint_t * RESTRICT src_bucket = &buckets[4 * ALPHABET_SIZE];
                sa_sint_t * RESTRICT dst_bucket = thread_state[omp_thread_num].state.buckets;

                fast_sint_t i, j;
                for (i = BUCKETS_INDEX2(0, 0), j = BUCKETS_INDEX4(0, 1); i <= BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX2(1, 0), j += BUCKETS_INDEX4(1, 0))
                {
                    dst_bucket[i] = src_bucket[i] - dst_bucket[j];
                }
            }

            {
                fast_sint_t t, omp_block_start = 0, omp_block_size =
thread_state[omp_thread_num].state.m; for (t = omp_num_threads - 1; t >= omp_thread_num; --t) omp_block_start += thread_state[t].state.m; /* When the accumulated count reaches m, the slice is shortened by one entry, matching the m - 1 size used by the single-threaded call. */ if (omp_block_start == (fast_sint_t)m && omp_block_size > 0) { omp_block_start -= 1; omp_block_size -= 1; } libsais16_radix_sort_lms_suffixes_16u(T, SA, thread_state[omp_thread_num].state.buckets, (fast_sint_t)n - omp_block_start, omp_block_size); } } #endif } } /* Backward radix pass over SA[omp_block_start, omp_block_start + omp_block_size): every entry p is re-stored at SA[--induction_bucket[T[p]]]. The first loop handles four entries per iteration and issues prefetches for SA, T and induction_bucket ahead of the cursor; the second loop finishes the remaining entries one at a time. */ static void libsais16_radix_sort_lms_suffixes_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4) { libsais16_prefetchr(&SA[i - 3 * prefetch_distance]); libsais16_prefetchr(&T[SA[i - 2 * prefetch_distance - 0]]); libsais16_prefetchr(&T[SA[i - 2 * prefetch_distance - 1]]); libsais16_prefetchr(&T[SA[i - 2 * prefetch_distance - 2]]); libsais16_prefetchr(&T[SA[i - 2 * prefetch_distance - 3]]); libsais16_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 0]]]); libsais16_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 1]]]); libsais16_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 2]]]); libsais16_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 3]]]); sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[T[p0]]] = p0; sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[T[p1]]] = p1; sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[T[p2]]] = p2; sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[T[p3]]] = p3; } /* Restore j to omp_block_start and finish the remaining entries one by one. */ for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1) { sa_sint_t p = SA[i]; SA[--induction_bucket[T[p]]] = p; } } /* Backward radix pass over SA[omp_block_start, omp_block_start + omp_block_size): every entry p is re-stored at SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]]. Four-way unrolled main loop with prefetching, followed by a scalar tail loop. */ static void libsais16_radix_sort_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j;
for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4) { libsais16_prefetchr(&SA[i - 3 * prefetch_distance]); libsais16_prefetchr(&T[SA[i - 2 * prefetch_distance - 0]]); libsais16_prefetchr(&T[SA[i - 2 * prefetch_distance - 1]]); libsais16_prefetchr(&T[SA[i - 2 * prefetch_distance - 2]]); libsais16_prefetchr(&T[SA[i - 2 * prefetch_distance - 3]]); libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 0]], 0)]); libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 1]], 0)]); libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 2]], 0)]); libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 3]], 0)]); sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0; sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1; sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2; sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3; } /* Restore j to omp_block_start and finish the remaining entries one by one. */ for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1) { sa_sint_t p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p; } } #if defined(LIBSAIS_OPENMP) /* Gather step of the block-parallel radix sort: for every i in [omp_block_start, omp_block_start + omp_block_size) stores cache[i].index = SA[i] and cache[i].symbol = T[SA[i]]. cache is indexed with the same i as SA. The main loop handles four entries per iteration with prefetching; a scalar loop handles the rest. */ static void libsais16_radix_sort_lms_suffixes_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) { libsais16_prefetchr(&SA[i + 2 * prefetch_distance]); libsais16_prefetchr(&T[SA[i + prefetch_distance + 0]]); libsais16_prefetchr(&T[SA[i + prefetch_distance + 1]]); libsais16_prefetchr(&T[SA[i + prefetch_distance + 2]]); libsais16_prefetchr(&T[SA[i + prefetch_distance + 3]]); libsais16_prefetchw(&cache[i + prefetch_distance]); cache[i + 0].symbol =
T[cache[i + 0].index = SA[i + 0]]; cache[i + 1].symbol = T[cache[i + 1].index = SA[i + 1]]; cache[i + 2].symbol = T[cache[i + 2].index = SA[i + 2]]; cache[i + 3].symbol = T[cache[i + 3].index = SA[i + 3]]; } /* Restore j to the end of the block and gather the remaining entries one by one. */ for (j += prefetch_distance + 3; i < j; i += 1) { cache[i].symbol = T[cache[i].index = SA[i]]; } } /* Bucket pass over cached entries: walks cache[omp_block_start .. omp_block_start + omp_block_size - 1] from the top down and overwrites each entry's symbol with --induction_bucket[symbol], i.e. the pre-decremented counter value for that symbol. NOTE(review): that value is presumably the SA destination consumed later by libsais16_place_cached_suffixes -- confirm against that helper. */ static void libsais16_radix_sort_lms_suffixes_32s_6k_block_sort(sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) { libsais16_prefetchw(&cache[i - 2 * prefetch_distance]); libsais16_prefetchw(&induction_bucket[cache[i - prefetch_distance - 0].symbol]); libsais16_prefetchw(&induction_bucket[cache[i - prefetch_distance - 1].symbol]); libsais16_prefetchw(&induction_bucket[cache[i - prefetch_distance - 2].symbol]); libsais16_prefetchw(&induction_bucket[cache[i - prefetch_distance - 3].symbol]); cache[i - 0].symbol = --induction_bucket[cache[i - 0].symbol]; cache[i - 1].symbol = --induction_bucket[cache[i - 1].symbol]; cache[i - 2].symbol = --induction_bucket[cache[i - 2].symbol]; cache[i - 3].symbol = --induction_bucket[cache[i - 3].symbol]; } for (j -= prefetch_distance + 3; i >= j; i -= 1) { cache[i].symbol = --induction_bucket[cache[i].symbol]; } } /* Bucket pass over cached entries for the 2k layout: walks the cache block from the top down and overwrites each entry's symbol with --induction_bucket[BUCKETS_INDEX2(symbol, 0)]. Four-way unrolled main loop with prefetching, followed by a scalar tail loop. */ static void libsais16_radix_sort_lms_suffixes_32s_2k_block_sort(sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) { libsais16_prefetchw(&cache[i - 2 * prefetch_distance]); libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 0].symbol, 0)]);
libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 1].symbol, 0)]); libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 2].symbol, 0)]); libsais16_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 3].symbol, 0)]); cache[i - 0].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 0].symbol, 0)]; cache[i - 1].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 1].symbol, 0)]; cache[i - 2].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 2].symbol, 0)]; cache[i - 3].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 3].symbol, 0)]; } /* Restore j to omp_block_start and finish the remaining entries one by one. */ for (j -= prefetch_distance + 3; i >= j; i -= 1) { cache[i].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i].symbol, 0)]; } } /* Processes SA[block_start, block_start + block_size) for the 6k bucket layout. The parallel region is used only when threads > 1 and block_size >= 16384. One thread: calls libsais16_radix_sort_lms_suffixes_32s_6k directly. Several threads: each thread gathers its slice into the cache, then the master thread alone runs the bucket pass over the whole block between two barriers, then each thread calls libsais16_place_cached_suffixes on its slice. Slices are block_size / omp_num_threads rounded down to a multiple of 16, and the last thread takes the remainder. cache is passed as cache - block_start so that it can be indexed with SA positions. */ static void libsais16_radix_sort_lms_suffixes_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); UNUSED(cache); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { libsais16_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { /* Phase 1: every thread gathers its own slice into the shared cache. */ libsais16_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); } #pragma omp barrier #pragma omp master { /* Phase 2: only the master thread updates induction_bucket, covering the whole block. */ libsais16_radix_sort_lms_suffixes_32s_6k_block_sort(induction_bucket, cache - block_start, block_start, block_size); } #pragma omp barrier { /* Phase 3: every thread places the cached entries of its own slice. */ libsais16_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); } } #endif } } /* Processes SA[block_start, block_start + block_size) for the 2k bucket layout. The parallel region is used only when threads > 1 and block_size >= 16384. One thread: calls libsais16_radix_sort_lms_suffixes_32s_2k directly. Several threads: per-thread gather into the cache, then a master-only bucket pass over the whole block between two barriers, then per-thread libsais16_place_cached_suffixes. Slices are block_size / omp_num_threads rounded down to a multiple of 16, and the last thread takes the remainder. cache is passed as cache - block_start so that it can be indexed with SA positions. */ static void libsais16_radix_sort_lms_suffixes_32s_2k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); UNUSED(cache); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { libsais16_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { /* Phase 1: every thread gathers its own slice into the shared cache. */ libsais16_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); } #pragma omp barrier #pragma omp master { /* Phase 2: only the master thread updates induction_bucket, covering the whole block. */ libsais16_radix_sort_lms_suffixes_32s_2k_block_sort(induction_bucket, cache - block_start, block_start, block_size); } #pragma omp barrier { /* Phase 3: every thread places the cached entries of its own slice. */ libsais16_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); } } #endif } } #endif /* Entry point of the LMS radix sort for the 6k layout. If threads == 1 or m < 65536, one call sorts the whole range (start n - m + 1, size m - 1). Otherwise, in OpenMP builds, the m - 1 entries are cut into chunks of at most threads * LIBSAIS_PER_THREAD_CACHE_SIZE; the chunk [block_start, block_end) is passed on with SA start n - block_end and thread_state[0].state.cache as the shared cache. NOTE(review): without LIBSAIS_OPENMP there is no else branch, so callers presumably always pass threads == 1 in that configuration -- confirm. */ static void libsais16_radix_sort_lms_suffixes_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { if (threads == 1 || m < 65536) { libsais16_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t block_start, block_end; for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end) { block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; /* Cap the last chunk so that exactly m - 1 entries are processed in total. */ if (block_end >= m) { block_end = (fast_sint_t)m - 1; } libsais16_radix_sort_lms_suffixes_32s_6k_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end, block_end - block_start, threads); } } #else UNUSED(thread_state); #endif } /* Entry point of the LMS radix sort for the 2k layout. If threads == 1 or m < 65536, one call sorts the whole range (start n - m + 1, size m - 1). Otherwise, in OpenMP builds, the m - 1 entries are cut into chunks of at most threads * LIBSAIS_PER_THREAD_CACHE_SIZE, each handed to the 2k block routine with SA start n - block_end and thread_state[0].state.cache as the shared cache. NOTE(review): without LIBSAIS_OPENMP there is no else branch, so callers presumably always pass threads == 1 in that configuration -- confirm. */ static void libsais16_radix_sort_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { if (threads == 1 || m < 65536) { libsais16_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1); } #if defined(LIBSAIS_OPENMP) else {
fast_sint_t block_start, block_end; for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end) { block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; /* Cap the last chunk so that exactly m - 1 entries are processed in total. */ if (block_end >= m) { block_end = (fast_sint_t)m - 1; } libsais16_radix_sort_lms_suffixes_32s_2k_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end, block_end - block_start, threads); } } #else UNUSED(thread_state); #endif } /* Single-pass variant for the 1k layout: scans T from right to left, starting with c0 = T[n - 1] and i = n - 2. f0 / f1 are 0/1 flags obtained by comparing a symbol with its right neighbour (c > c_right - flag_right). Whenever the flag of the current position is set while the flag of the position to its right is clear, that right-hand position is stored at SA[--buckets[its symbol]] and m is incremented; c2 remembers the symbol of the most recent store. After the scan, if m > 1, SA[buckets[c2]] is overwritten with 0. Returns m. NOTE(review): the flags presumably encode L/S suffix types, so the stored positions would be the LMS suffixes named in the function -- confirm. */ static sa_sint_t libsais16_radix_sort_lms_suffixes_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets) { const fast_sint_t prefetch_distance = 64; sa_sint_t i = n - 2; sa_sint_t m = 0; fast_uint_t f0 = 1; fast_uint_t f1 = 0; fast_sint_t c0 = T[n - 1]; fast_sint_t c1 = 0; fast_sint_t c2 = 0; /* Unrolled by four; c0/c1 and f0/f1 alternate roles from one step to the next, so the statement order matters. */ for (; i >= prefetch_distance + 3; i -= 4) { libsais16_prefetchr(&T[i - 2 * prefetch_distance]); libsais16_prefetchw(&buckets[T[i - prefetch_distance - 0]]); libsais16_prefetchw(&buckets[T[i - prefetch_distance - 1]]); libsais16_prefetchw(&buckets[T[i - prefetch_distance - 2]]); libsais16_prefetchw(&buckets[T[i - prefetch_distance - 3]]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); if (f1 & ~f0) { SA[--buckets[c2 = c0]] = i + 1; m++; } c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); if (f0 & ~f1) { SA[--buckets[c2 = c1]] = i - 0; m++; } c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); if (f1 & ~f0) { SA[--buckets[c2 = c0]] = i - 1; m++; } c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); if (f0 & ~f1) { SA[--buckets[c2 = c1]] = i - 2; m++; } } /* Scalar tail down to i == 0, shifting the current symbol and flag into c1 / f1 on each step. */ for (; i >= 0; i -= 1) { c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); if (f0 & ~f1) { SA[--buckets[c2 = c1]] = i + 1; m++; } } if (m > 1) { SA[buckets[c2]] = 0; } return m; } /* Sets the SAINT_MIN bit in SA[induction_bucket[i]] for every i in [omp_block_start, omp_block_start + omp_block_size). Four-way unrolled main loop with prefetching, followed by a scalar tail loop. */ static void libsais16_radix_sort_set_markers_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start,
fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) { libsais16_prefetchr(&induction_bucket[i + 2 * prefetch_distance]); libsais16_prefetchw(&SA[induction_bucket[i + prefetch_distance + 0]]); libsais16_prefetchw(&SA[induction_bucket[i + prefetch_distance + 1]]); libsais16_prefetchw(&SA[induction_bucket[i + prefetch_distance + 2]]); libsais16_prefetchw(&SA[induction_bucket[i + prefetch_distance + 3]]); SA[induction_bucket[i + 0]] |= SAINT_MIN; SA[induction_bucket[i + 1]] |= SAINT_MIN; SA[induction_bucket[i + 2]] |= SAINT_MIN; SA[induction_bucket[i + 3]] |= SAINT_MIN; } /* Restore j to the end of the block and mark the remaining entries one by one. */ for (j += prefetch_distance + 3; i < j; i += 1) { SA[induction_bucket[i]] |= SAINT_MIN; } } /* ORs SUFFIX_GROUP_MARKER into SA[induction_bucket[BUCKETS_INDEX2(i, 0)]] for every i in [omp_block_start, omp_block_start + omp_block_size). Four-way unrolled main loop with prefetching, followed by a scalar tail loop. */ static void libsais16_radix_sort_set_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) { libsais16_prefetchr(&induction_bucket[BUCKETS_INDEX2(i + 2 * prefetch_distance, 0)]); libsais16_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 0, 0)]]); libsais16_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 1, 0)]]); libsais16_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 2, 0)]]); libsais16_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 3, 0)]]); SA[induction_bucket[BUCKETS_INDEX2(i + 0, 0)]] |= SUFFIX_GROUP_MARKER; SA[induction_bucket[BUCKETS_INDEX2(i + 1, 0)]] |= SUFFIX_GROUP_MARKER; SA[induction_bucket[BUCKETS_INDEX2(i + 2, 0)]] |= SUFFIX_GROUP_MARKER; SA[induction_bucket[BUCKETS_INDEX2(i + 3, 0)]] |= SUFFIX_GROUP_MARKER; } /* Restore j to the end of the block and mark the remaining entries one by one. */ for (j += prefetch_distance + 3; i < j; i += 1) { SA[induction_bucket[BUCKETS_INDEX2(i, 0)]] |= SUFFIX_GROUP_MARKER; }
} /* Runs the SAINT_MIN marker pass (libsais16_radix_sort_set_markers_32s_6k) over indices [0, k - 1). With OpenMP, when threads > 1 and k >= 65536, the range is split across threads in strides of (k - 1) / omp_num_threads rounded down to a multiple of 16, and the last thread takes the remainder; otherwise a single call covers the whole range. */ static void libsais16_radix_sort_set_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && k >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); fast_sint_t omp_block_stride = (((fast_sint_t)k - 1) / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : (fast_sint_t)k - 1 - omp_block_start; #else UNUSED(threads); fast_sint_t omp_block_start = 0; fast_sint_t omp_block_size = (fast_sint_t)k - 1; #endif libsais16_radix_sort_set_markers_32s_6k(SA, induction_bucket, omp_block_start, omp_block_size); } } /* Runs the SUFFIX_GROUP_MARKER pass (libsais16_radix_sort_set_markers_32s_4k) over indices [0, k - 1). With OpenMP, when threads > 1 and k >= 65536, the range is split across threads in strides of (k - 1) / omp_num_threads rounded down to a multiple of 16, and the last thread takes the remainder; otherwise a single call covers the whole range. */ static void libsais16_radix_sort_set_markers_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && k >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); fast_sint_t omp_block_stride = (((fast_sint_t)k - 1) / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : (fast_sint_t)k - 1 - omp_block_start; #else UNUSED(threads); fast_sint_t omp_block_start = 0; fast_sint_t omp_block_size = (fast_sint_t)k - 1; #endif libsais16_radix_sort_set_markers_32s_4k(SA, induction_bucket, omp_block_start, omp_block_size); } } /* Prepares bucket offsets for partial sorting of a 16-bit text. First increments the count at BUCKETS_INDEX4(T[first_lms_suffix], 1). Then, for every symbol c: the temp_bucket (= &buckets[4 * ALPHABET_SIZE]) entry at BUCKETS_INDEX2(c, 0) receives the running sum0 (seeded with left_suffixes_count + 1) before the counts at BUCKETS_INDEX4(c, 0) and (c, 2) are added to it; sum1 accumulates the counts at BUCKETS_INDEX4(c, 1); the updated sum0 and sum1 are written back into buckets at BUCKETS_INDEX2(c, 0) and (c, 1). */ static void libsais16_initialize_buckets_for_partial_sorting_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count) { sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE]; buckets[BUCKETS_INDEX4((fast_uint_t)T[first_lms_suffix], 1)]++; fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0; /* i walks the 4-way count layout that is read, j walks the 2-way layout that is written into the same array. */ for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) { temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0; sum0 += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 2)]; sum1 += buckets[i + BUCKETS_INDEX4(0, 1)]; buckets[j + BUCKETS_INDEX2(0, 0)] = sum0; buckets[j + BUCKETS_INDEX2(0, 1)] = sum1; } } /* Prepares bucket offsets for partial sorting with the 6k layout. For every symbol c in [0, k) the four counts at BUCKETS_INDEX4(c, 0..3) are read (named SS, LS, SL, LL) and replaced by sum0, sum2, 0, 0; afterwards sum0 += SS + SL, sum1 += LS, sum2 += LS + LL, and the advanced sum0 / sum1 are stored in temp_bucket (= &buckets[4 * k]) at BUCKETS_INDEX2(c, 0) / (c, 1). The work is split into two otherwise identical loops at symbol T[first_lms_suffix], where sum1 receives an extra + 1. The first_lms_suffix parameter is reused to hold that symbol. */ static void libsais16_initialize_buckets_for_partial_sorting_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count) { sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k]; fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0, sum2 = 0; for (first_lms_suffix = T[first_lms_suffix], i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i != BUCKETS_INDEX4((fast_sint_t)first_lms_suffix, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) { sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)]; sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)]; sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)]; sa_sint_t LL = buckets[i + BUCKETS_INDEX4(0, 3)]; buckets[i + BUCKETS_INDEX4(0, 0)] = sum0; buckets[i + BUCKETS_INDEX4(0, 1)] = sum2; buckets[i + BUCKETS_INDEX4(0, 2)] = 0; buckets[i + BUCKETS_INDEX4(0, 3)] = 0; sum0 += SS + SL;
sum1 += LS; sum2 += LS + LL; temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0; temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1; } /* Second half: starts at the symbol of the first LMS suffix, with sum1 bumped by one, and runs through symbol k - 1. */ for (sum1 += 1; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) { sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)]; sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)]; sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)]; sa_sint_t LL = buckets[i + BUCKETS_INDEX4(0, 3)]; buckets[i + BUCKETS_INDEX4(0, 0)] = sum0; buckets[i + BUCKETS_INDEX4(0, 1)] = sum2; buckets[i + BUCKETS_INDEX4(0, 2)] = 0; buckets[i + BUCKETS_INDEX4(0, 3)] = 0; sum0 += SS + SL; sum1 += LS; sum2 += LS + LL; temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0; temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1; } } /* Left-to-right induction pass over SA[omp_block_start, omp_block_start + omp_block_size) for a 16-bit text. For each entry p: d is incremented when the top bit of p is set, the bit is stripped, v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]) selects a slot, and p - 1 is appended at SA[induction_bucket[v]++] with its top bit set iff distinct_names[v] != d; distinct_names[v] is then set to d. induction_bucket is &buckets[4 * ALPHABET_SIZE] and distinct_names is &buckets[2 * ALPHABET_SIZE]. Two entries are handled per iteration of the prefetching main loop, and a scalar loop handles the rest. Returns the final d. */ static sa_sint_t libsais16_partial_sorting_scan_left_to_right_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE]; sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) { libsais16_prefetchr(&SA[i + 2 * prefetch_distance]); libsais16_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1); libsais16_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2); libsais16_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1); libsais16_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2); sa_sint_t p0 = SA[i + 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); SA[induction_bucket[v0]++] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; sa_sint_t p1 = SA[i + 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]);
SA[induction_bucket[v1]++] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d; } /* Restore j to the end of the block and induce the remaining entries one by one. */ for (j += prefetch_distance + 1; i < j; i += 1) { sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]); SA[induction_bucket[v]++] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; } return d; } #if defined(LIBSAIS_OPENMP) /* Per-thread dry run of the left-to-right scan. Clears 2 * k entries of the induction_bucket (&buckets[0]) and distinct_names (&buckets[2 * ALPHABET_SIZE]) areas of the given buckets array. Then, for every SA entry of the block, it records the original SA value and the slot v in cache[count], counts v in induction_bucket, and notes the local d (which starts at 1) in distinct_names[v]. On exit state[0].state.position is d - 1 and state[0].state.count is the number of cached entries. SA is only read here. */ static void libsais16_partial_sorting_scan_left_to_right_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size, LIBSAIS_THREAD_STATE * RESTRICT state) { const fast_sint_t prefetch_distance = 64; sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; memset(induction_bucket, 0, (size_t)2 * (size_t)k * sizeof(sa_sint_t)); memset(distinct_names , 0, (size_t)2 * (size_t)k * sizeof(sa_sint_t)); fast_sint_t i, j, count = 0; sa_sint_t d = 1; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) { libsais16_prefetchr(&SA[i + 2 * prefetch_distance]); libsais16_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1); libsais16_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2); libsais16_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1); libsais16_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2); sa_sint_t p0 = cache[count].index = SA[i + 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); induction_bucket[v0]++; distinct_names[v0] = d; sa_sint_t p1 = cache[count].index = SA[i + 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]);
induction_bucket[v1]++; distinct_names[v1] = d; } /* Restore j to the end of the block and record the remaining entries one by one. */ for (j += prefetch_distance + 1; i < j; i += 1) { sa_sint_t p = cache[count].index = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]); induction_bucket[v]++; distinct_names[v] = d; } state[0].state.position = (fast_sint_t)d - 1; state[0].state.count = count; } /* Replays count cached entries in order, starting from name counter d. For each entry, d is incremented when the cached index is negative, and index - 1 is stored at SA[induction_bucket[v]++] with the top bit set iff distinct_names[v] != d; distinct_names[v] is then set to d. induction_bucket is &buckets[0] and distinct_names is &buckets[2 * ALPHABET_SIZE] of the array passed in. The cached index is used without masking its top bit. */ static void libsais16_partial_sorting_scan_left_to_right_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d) { const fast_sint_t prefetch_distance = 64; sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; fast_sint_t i, j; for (i = 0, j = count - 1; i < j; i += 2) { libsais16_prefetchr(&cache[i + prefetch_distance]); sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol; SA[induction_bucket[v0]++] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol; SA[induction_bucket[v1]++] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d; } /* Handles the final entry when count is odd. */ for (j += 1; i < j; i += 1) { sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol; SA[induction_bucket[v]++] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; } } /* Runs the left-to-right scan on SA[block_start, block_start + block_size) and returns the updated d. The parallel region requires threads > 1, block_size >= 64 * max(k, 256) and omp_get_dynamic() == 0. One thread: plain sequential scan. Several threads: (1) every thread prepares its slice into its own cache and buckets; (2) after a barrier the master thread merges the per-thread results into the shared buckets in thread order; (3) after a second barrier every thread places its cached entries into SA. */ static sa_sint_t libsais16_partial_sorting_scan_left_to_right_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ?
k : 256) && omp_get_dynamic() == 0) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(k); UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif /* Slice per thread: block_size / omp_num_threads rounded down to a multiple of 16; the last thread takes the remainder. */ fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { d = libsais16_partial_sorting_scan_left_to_right_16u(T, SA, buckets, d, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { libsais16_partial_sorting_scan_left_to_right_16u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]); } #pragma omp barrier #pragma omp master { sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE]; sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; fast_sint_t t; for (t = 0; t < omp_num_threads; ++t) { sa_sint_t * RESTRICT temp_induction_bucket = &thread_state[t].state.buckets[0 * ALPHABET_SIZE]; sa_sint_t * RESTRICT temp_distinct_names = &thread_state[t].state.buckets[2 * ALPHABET_SIZE]; fast_sint_t c; /* The shared position advances by the thread's count B; the thread keeps the previous shared position A as its own starting position. */ for (c = 0; c < 2 * k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A + B; temp_induction_bucket[c] = A; } /* Names the thread recorded (B > 0) are rebased by the running d; untouched slots keep the shared value A. The thread's copy always receives the previous shared value A. */ for (d -= 1, c = 0; c < 2 * k; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ?
D : A; temp_distinct_names[c] = A; } /* Net effect: d grows by the thread's local increment count (the old state.position), and state.position becomes the d value this thread starts from when placing. */ d += 1 + (sa_sint_t)thread_state[t].state.position; thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position; } } #pragma omp barrier { libsais16_partial_sorting_scan_left_to_right_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position); } } #endif } return d; } #endif /* Top-level left-to-right partial sorting for a 16-bit text. Seeds the scan with suffix n - 1 (stored with SAINT_MIN set, and its slot's distinct name set to ++d), then scans SA[0, left_suffixes_count). If threads == 1 or left_suffixes_count < 65536 this is one sequential call. Otherwise, in OpenMP builds, the range is walked in blocks that end at a zero entry of SA or after threads * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads) entries; zero entries are skipped, blocks shorter than 32 entries are induced inline, and longer blocks go to the block-parallel routine. Returns the final d. */ static sa_sint_t libsais16_partial_sorting_scan_left_to_right_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE]; sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; SA[induction_bucket[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN; distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])] = ++d; if (threads == 1 || left_suffixes_count < 65536) { d = libsais16_partial_sorting_scan_left_to_right_16u(T, SA, buckets, d, 0, left_suffixes_count); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t block_start; for (block_start = 0; block_start < left_suffixes_count; ) { if (SA[block_start] == 0) { block_start++; } else { fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > left_suffixes_count) { block_max_end = left_suffixes_count;} fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; } fast_sint_t block_size = block_end - block_start; if (block_size < 32) { /* Short run: induce inline, using the same per-entry logic as the sequential scan. */ for (; block_start < block_end; block_start += 1) { sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]); SA[induction_bucket[v]++] = (p - 1)
| (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; } } else { d = libsais16_partial_sorting_scan_left_to_right_16u_block_omp(T, SA, k, buckets, d, block_start, block_size, threads, thread_state); block_start = block_end; } } } } #else UNUSED(k); UNUSED(thread_state); #endif return d; } /* Left-to-right induction pass over SA[omp_block_start, omp_block_start + omp_block_size) for the 6k layout. For each entry p: d is incremented when the top bit is set, the bit is stripped, v = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]); p - 1 is appended at SA[buckets[v]++] with its top bit set iff buckets[2 + v] != d, and buckets[2 + v] is then set to d. The two-way unrolled loop prefetches SA and T two distances ahead and the bucket entry one distance ahead; a scalar loop handles the rest. Returns the final d. */ static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) { libsais16_prefetchr(&SA[i + 3 * prefetch_distance]); libsais16_prefetchr(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 1); libsais16_prefetchr(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 2); libsais16_prefetchr(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 1); libsais16_prefetchr(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 2); /* Prefetch only: p - (p > 0) keeps the T index non-negative when the entry is 0. */ sa_sint_t p0 = SA[i + prefetch_distance + 0] & SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); libsais16_prefetchw(&buckets[v0]); sa_sint_t p1 = SA[i + prefetch_distance + 1] & SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0); libsais16_prefetchw(&buckets[v1]); sa_sint_t p2 = SA[i + 0]; d += (p2 < 0); p2 &= SAINT_MAX; sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] >= T[p2 - 1]); SA[buckets[v2]++] = (p2 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1)); buckets[2 + v2] = d; sa_sint_t p3 = SA[i + 1]; d += (p3 < 0); p3 &= SAINT_MAX; sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] >= T[p3 - 1]); SA[buckets[v3]++] = (p3 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1)); buckets[2 + v3] = d; } for (j += 2 * prefetch_distance + 1; i < j; i += 1) { sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX4(T[p -
1], T[p - 2] >= T[p - 1]); SA[buckets[v]++] = (p - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d; } return d; } /* Left-to-right induction pass over SA[omp_block_start, omp_block_start + omp_block_size) for the 4k layout, with induction_bucket = &buckets[2 * k] and distinct_names = &buckets[0]. Every visited SA[i] has its top bit cleared. Entries with p > 0 are additionally zeroed and induced: d is increased by p >> (SUFFIX_GROUP_BIT - 1), SUFFIX_GROUP_MARKER is removed from p, and p - 1 is appended at SA[induction_bucket[T[p - 1]]++] carrying the top bit iff T[p - 2] < T[p - 1] and the group bit iff distinct_names[v] != d, where v = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]); distinct_names[v] is then set to d. Returns the final d. */ static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; sa_sint_t * RESTRICT induction_bucket = &buckets[2 * (fast_sint_t)k]; sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k]; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) { libsais16_prefetchw(&SA[i + 3 * prefetch_distance]); /* Prefetch only: non-positive entries fall back to index 2 so that Ts - 1 and Ts - 2 stay inside T. */ sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 & ~SUFFIX_GROUP_MARKER : 2]; libsais16_prefetchr(Ts0 - 1); libsais16_prefetchr(Ts0 - 2); sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 & ~SUFFIX_GROUP_MARKER : 2]; libsais16_prefetchr(Ts1 - 1); libsais16_prefetchr(Ts1 - 2); /* One prefetch distance ahead: warm the bucket and name slots of entries that will be induced. */ sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais16_prefetchw(&induction_bucket[Ts2]); libsais16_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); } sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais16_prefetchw(&induction_bucket[Ts3]); libsais16_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); } /* SA[i] first loses its top bit; if the original value was positive the slot is then zeroed and the entry is induced. */ sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { SA[i + 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]); SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d; } sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { SA[i + 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); p1 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]); SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d; } } /* Restore j to the end of the block and process the remaining entries one by one. */ for (j += 2 * prefetch_distance + 1; i < j; i += 1) { sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]); SA[induction_bucket[T[p - 1]]++] = (p - 1) | (sa_sint_t)((sa_uint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d; } } return d; } /* Left-to-right induction pass over SA[omp_block_start, omp_block_start + omp_block_size) for the 1k layout. Every visited SA[i] has its top bit cleared; entries with p > 0 are additionally zeroed and p - 1 is appended at SA[induction_bucket[T[p - 1]]++] with the top bit set iff T[p - 2] < T[p - 1]. Two-way unrolled main loop with prefetching, followed by a scalar tail loop. */ static void libsais16_partial_sorting_scan_left_to_right_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t *
RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    /*
     * Scans SA[omp_block_start .. omp_block_start + omp_block_size) in
     * ascending order. For every positive entry p the slot is set to 0 and
     * (p - 1) is written to SA at the cursor induction_bucket[T[p - 1]], which
     * is then incremented; bit (SAINT_BIT - 1) of the stored value is set when
     * T[p - 2] < T[p - 1]. Non-positive entries are only masked with SAINT_MAX.
     */
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Main loop: two entries per iteration, with look-ahead prefetches. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2)
    {
        libsais16_prefetchw(&SA[i + 3 * prefetch_distance]);

        /* Non-positive look-ahead entries fall back to index 1, so Ts - 1 is &T[0]. */
        sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 1]; libsais16_prefetchr(Ts0 - 1);
        sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ? s1 : 1]; libsais16_prefetchr(Ts1 - 1);
        sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais16_prefetchw(&induction_bucket[T[s2 - 1]]); libsais16_prefetchr(&T[s2] - 2); }
        sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais16_prefetchw(&induction_bucket[T[s3 - 1]]); libsais16_prefetchr(&T[s3] - 2); }

        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { SA[i + 0] = 0; SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)); }
        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { SA[i + 1] = 0; SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)); }
    }

    /* Tail loop: remaining entries, no prefetching. */
    for (j += 2 * prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { SA[i] = 0; SA[induction_bucket[T[p - 1]]++] = (p - 1) | (sa_sint_t)((sa_uint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)); }
    }
}

#if defined(LIBSAIS_OPENMP)

/*
 * Gather step for the 6k variant. For every i in the block, copies SA[i] into
 * cache[i].index and stores in cache[i].symbol the value
 * BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]) with p = SA[i] & SAINT_MAX,
 * or 0 when p is 0. SA itself is not written. cache is indexed with the same
 * index i as SA.
 */
static void libsais16_partial_sorting_scan_left_to_right_32s_6k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais16_prefetchr(&SA[i + 2 * prefetch_distance]);

        libsais16_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
        libsais16_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
        libsais16_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
        libsais16_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);

        libsais16_prefetchw(&cache[i + prefetch_distance]);

        sa_sint_t p0 = cache[i + 0].index = SA[i + 0]; sa_sint_t symbol0 = 0; p0 &= SAINT_MAX; if (p0 != 0) { symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); } cache[i + 0].symbol = symbol0;
        sa_sint_t p1 = cache[i + 1].index = SA[i + 1]; sa_sint_t symbol1 = 0; p1 &= SAINT_MAX; if (p1 != 0) { symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); } cache[i + 1].symbol = symbol1;
    }

    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = cache[i].index = SA[i]; sa_sint_t symbol = 0; p &= SAINT_MAX; if (p != 0) { symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]); } cache[i].symbol = symbol;
    }
}

/*
 * Gather step for the 4k variant. For a positive SA[i] = p: cache[i].index
 * receives p (marker bit included), cache[i].symbol receives
 * BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]) computed on p with
 * SUFFIX_GROUP_MARKER stripped, and SA[i] is set to 0. For a non-positive
 * SA[i]: cache[i].symbol is SAINT_MIN, cache[i].index is left unwritten, and
 * SA[i] is masked with SAINT_MAX.
 */
static void libsais16_partial_sorting_scan_left_to_right_32s_4k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais16_prefetchw(&SA[i + 2 * prefetch_distance]);

        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 & ~SUFFIX_GROUP_MARKER : 2]; libsais16_prefetchr(Ts0 - 1); libsais16_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ? s1 & ~SUFFIX_GROUP_MARKER : 2]; libsais16_prefetchr(Ts1 - 1); libsais16_prefetchr(Ts1 - 2);

        libsais16_prefetchw(&cache[i + prefetch_distance]);

        /* p is reset to 0 inside the branch so the final store zeroes consumed slots. */
        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { cache[i + 0].index = p0; p0 &= ~SUFFIX_GROUP_MARKER; symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]); p0 = 0; } cache[i + 0].symbol = symbol0; SA[i + 0] = p0 & SAINT_MAX;
        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { cache[i + 1].index = p1; p1 &= ~SUFFIX_GROUP_MARKER; symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]); p1 = 0; } cache[i + 1].symbol = symbol1; SA[i + 1] = p1 & SAINT_MAX;
    }

    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { cache[i].index = p; p &= ~SUFFIX_GROUP_MARKER; symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]); p = 0; } cache[i].symbol = symbol; SA[i] = p & SAINT_MAX;
    }
}

/*
 * Gather step for the 1k variant. For a positive SA[i] = p: cache[i].index
 * receives (p - 1) with bit (SAINT_BIT - 1) set when T[p - 2] < T[p - 1],
 * cache[i].symbol receives T[p - 1], and SA[i] is set to 0. For a non-positive
 * SA[i]: cache[i].symbol is SAINT_MIN and SA[i] is masked with SAINT_MAX.
 */
static void libsais16_partial_sorting_scan_left_to_right_32s_1k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais16_prefetchw(&SA[i + 2 * prefetch_distance]);

        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais16_prefetchr(Ts0 - 1); libsais16_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 : 2]; libsais16_prefetchr(Ts1 - 1); libsais16_prefetchr(Ts1 - 2);

        libsais16_prefetchw(&cache[i + prefetch_distance]);

        /* Positive entry: cache (p - 1) with the comparison flag in the top bit and the symbol T[p - 1]; the SA slot is zeroed. */
        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { cache[i + 0].index = (p0 - 1) | (sa_sint_t)((sa_uint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)); symbol0 = T[p0 - 1]; p0 = 0; } cache[i + 0].symbol = symbol0; SA[i + 0] = p0 & SAINT_MAX;
        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { cache[i + 1].index = (p1 - 1) | (sa_sint_t)((sa_uint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)); symbol1 = T[p1 - 1]; p1 = 0; } cache[i + 1].symbol = symbol1; SA[i + 1] = p1 & SAINT_MAX;
    }

    /* Tail loop: remaining entries, no prefetching. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { cache[i].index = (p - 1) | (sa_sint_t)((sa_uint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)); symbol = T[p - 1]; p = 0; } cache[i].symbol = symbol; SA[i] = p & SAINT_MAX;
    }
}

/*
 * Sort step for the 6k variant; walks cache[omp_block_start .. omp_block_end)
 * in ascending order.
 *
 * For each entry (v = symbol, p = index): d is incremented when p < 0;
 * cache[i].symbol becomes the cursor value buckets[v], which is then
 * incremented; cache[i].index becomes (p - 1) with bit (SAINT_BIT - 1) set
 * when buckets[2 + v] differs from d; buckets[2 + v] is set to d.
 *
 * When the new cache[i].symbol is below omp_block_end, the new index is also
 * copied to cache[that position], together with a freshly computed
 * BUCKETS_INDEX4 symbol (presumably so this same pass processes it when it
 * reaches that position -- TODO confirm).
 *
 * Returns the updated d.
 */
static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_6k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
    {
        libsais16_prefetchw(&cache[i + 2 * prefetch_distance]);

        libsais16_prefetchw(&buckets[cache[i + prefetch_distance + 0].symbol]);
        libsais16_prefetchw(&buckets[cache[i + prefetch_distance + 1].symbol]);

        sa_sint_t v0 = cache[i + 0].symbol, p0 = cache[i + 0].index; d += (p0 < 0); cache[i + 0].symbol = buckets[v0]++; cache[i + 0].index = (p0 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1)); buckets[2 + v0] = d;
        if (cache[i + 0].symbol < omp_block_end) { sa_sint_t s = cache[i + 0].symbol, q = (cache[s].index = cache[i + 0].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); }

        sa_sint_t v1 = cache[i + 1].symbol, p1 = cache[i + 1].index; d += (p1 < 0); cache[i + 1].symbol = buckets[v1]++; cache[i + 1].index = (p1 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1)); buckets[2 + v1] = d;
        if (cache[i + 1].symbol < omp_block_end) { sa_sint_t s = cache[i + 1].symbol, q = (cache[s].index = cache[i + 1].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); }
    }

    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t v = cache[i].symbol, p = cache[i].index; d += (p < 0); cache[i].symbol = buckets[v]++; cache[i].index = (p - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
        if (cache[i].symbol < omp_block_end) { sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); }
    }

    return d;
}

/*
 * Sort step for the 4k variant; walks cache[omp_block_start .. omp_block_end)
 * in ascending order and skips entries whose symbol is negative.
 *
 * For each remaining entry (v = symbol, p = index): d is advanced by
 * p >> (SUFFIX_GROUP_BIT - 1); cache[i].symbol becomes the cursor value
 * induction_bucket[v >> 1], which is then incremented; cache[i].index becomes
 * (p - 1) combined with v shifted left by (SAINT_BIT - 1) and with bit
 * (SUFFIX_GROUP_BIT - 1) set when distinct_names[v] differs from d;
 * distinct_names[v] is set to d.
 *
 * When the new cache[i].symbol is below omp_block_end and the new index np is
 * positive, np is copied to cache[that position] with a freshly computed
 * BUCKETS_INDEX2 symbol, and cache[i].index is cleared to 0; a non-positive np
 * is instead masked with SAINT_MAX in place.
 *
 * Returns the updated d.
 */
static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_4k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[2 * (fast_sint_t)k];
    sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k];

    fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
    {
        libsais16_prefetchw(&cache[i + 2 * prefetch_distance]);

        /* Non-positive look-ahead symbols are clamped to index 0 for prefetching. */
        sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 > 0 ? s0 >> 1 : 0]; libsais16_prefetchw(Is0); const sa_sint_t * Ds0 = &distinct_names[s0 > 0 ? s0 : 0]; libsais16_prefetchw(Ds0);
        sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 > 0 ? s1 >> 1 : 0]; libsais16_prefetchw(Is1); const sa_sint_t * Ds1 = &distinct_names[s1 > 0 ? s1 : 0]; libsais16_prefetchw(Ds1);

        sa_sint_t v0 = cache[i + 0].symbol;
        if (v0 >= 0)
        {
            sa_sint_t p0 = cache[i + 0].index; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); cache[i + 0].symbol = induction_bucket[v0 >> 1]++; cache[i + 0].index = (p0 - 1) | (sa_sint_t)((sa_uint_t)v0 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d;
            if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i + 0].index = np & SAINT_MAX; }
        }

        sa_sint_t v1 = cache[i + 1].symbol;
        if (v1 >= 0)
        {
            sa_sint_t p1 = cache[i + 1].index; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); cache[i + 1].symbol = induction_bucket[v1 >> 1]++; cache[i + 1].index = (p1 - 1) | (sa_sint_t)((sa_uint_t)v1 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
            if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i + 1].index = np & SAINT_MAX; }
        }
    }

    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t v = cache[i].symbol;
        if (v >= 0)
        {
            sa_sint_t p = cache[i].index; d += (p >> (SUFFIX_GROUP_BIT - 1)); cache[i].symbol = induction_bucket[v >> 1]++; cache[i].index = (p - 1) | (sa_sint_t)((sa_uint_t)v << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
            if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i].index = np & SAINT_MAX; }
        }
    }

    return d;
}

static void
libsais16_partial_sorting_scan_left_to_right_32s_1k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    /*
     * Sort step for the 1k variant; walks cache[omp_block_start ..
     * omp_block_end) in ascending order and skips entries whose symbol is
     * negative.
     *
     * For each remaining entry, cache[i].symbol is replaced by the cursor
     * value induction_bucket[v], which is then incremented. When that value is
     * below omp_block_end and cache[i].index (np) is positive, cache[value]
     * receives (np - 1) with bit (SAINT_BIT - 1) set when
     * T[np - 2] < T[np - 1], plus the symbol T[np - 1], and cache[i].index is
     * cleared to 0; a non-positive np is instead masked with SAINT_MAX.
     */
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
    {
        libsais16_prefetchw(&cache[i + 2 * prefetch_distance]);

        /* Non-positive look-ahead symbols are clamped to index 0 for prefetching. */
        sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 > 0 ? s0 : 0]; libsais16_prefetchw(Is0);
        sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 > 0 ? s1 : 0]; libsais16_prefetchw(Is1);

        sa_sint_t v0 = cache[i + 0].symbol;
        if (v0 >= 0)
        {
            cache[i + 0].symbol = induction_bucket[v0]++;
            if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; if (np > 0) { cache[ni].index = (np - 1) | (sa_sint_t)((sa_uint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i + 0].index = np & SAINT_MAX; }
        }

        sa_sint_t v1 = cache[i + 1].symbol;
        if (v1 >= 0)
        {
            cache[i + 1].symbol = induction_bucket[v1]++;
            if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; if (np > 0) { cache[ni].index = (np - 1) | (sa_sint_t)((sa_uint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i + 1].index = np & SAINT_MAX; }
        }
    }

    /* Tail loop: remaining entries, no prefetching. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t v = cache[i].symbol;
        if (v >= 0)
        {
            cache[i].symbol = induction_bucket[v]++;
            if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[ni].index = (np - 1) | (sa_sint_t)((sa_uint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i].index = np & SAINT_MAX; }
        }
    }
}

static sa_sint_t
libsais16_partial_sorting_scan_left_to_right_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
    /*
     * Processes one block [block_start, block_start + block_size) of the 6k
     * left-to-right scan. The parallel region is only requested when
     * threads > 1 and block_size >= 16384. With a single thread the sequential
     * scan is called directly. Otherwise the block runs in three phases
     * separated by barriers: every thread gathers its slice into cache, the
     * master thread runs the sort step over the whole block, then every thread
     * places its slice of the cache back into SA. Returns the updated d.
     */
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Per-thread slice: stride rounded down to a multiple of 16; the last thread takes the remainder. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            d = libsais16_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /* cache is passed rebased by block_start, so the helpers index it with absolute SA positions. */
            {
                libsais16_partial_sorting_scan_left_to_right_32s_6k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            #pragma omp master
            {
                d = libsais16_partial_sorting_scan_left_to_right_32s_6k_block_sort(T, buckets, d, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            {
                libsais16_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }

    return d;
}

/*
 * Processes one block of the 4k left-to-right scan. Same three-phase layout as
 * the 6k block driver (parallel gather, sort on the master thread, parallel
 * placement), using the 4k gather/sort helpers and
 * libsais16_compact_and_place_cached_suffixes for the final phase. Falls back
 * to the sequential 4k scan when only one thread runs. Returns the updated d.
 */
static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_4k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Per-thread slice: stride rounded down to a multiple of 16; the last thread takes the remainder. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            d = libsais16_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                libsais16_partial_sorting_scan_left_to_right_32s_4k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            #pragma omp master
            {
                d = libsais16_partial_sorting_scan_left_to_right_32s_4k_block_sort(T, k, buckets, d, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            {
                libsais16_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }

    return d;
}

/*
 * Processes one block of the 1k left-to-right scan. Same three-phase layout as
 * the 4k block driver, using the 1k gather/sort helpers; there is no d counter
 * in this variant.
 */
static void libsais16_partial_sorting_scan_left_to_right_32s_1k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Per-thread slice: stride rounded down to a multiple of 16; the last thread takes the remainder. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            libsais16_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                libsais16_partial_sorting_scan_left_to_right_32s_1k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            #pragma omp master
            {
                libsais16_partial_sorting_scan_left_to_right_32s_1k_block_sort(T, buckets, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            {
                libsais16_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }
}

#endif

/*
 * Entry point of the 6k left-to-right scan.
 *
 * First seeds SA with position n - 1: (n - 1) | SAINT_MIN is written at the
 * cursor buckets[BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])], the cursor is
 * incremented, and the slot two entries after that cursor is set to the
 * pre-incremented d. Then scans SA[0 .. left_suffixes_count): sequentially
 * when threads == 1 or left_suffixes_count < 65536, otherwise in blocks of
 * threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries through the block driver,
 * using thread_state[0].state.cache.
 *
 * NOTE(review): without LIBSAIS_OPENMP the else branch is compiled out, so no
 * scan runs when threads != 1 and left_suffixes_count >= 65536; presumably
 * callers always pass threads == 1 in that build -- confirm.
 *
 * Returns the updated d.
 */
static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    SA[buckets[BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN;
    buckets[2 + BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])] = ++d;

    if (threads == 1 || left_suffixes_count < 65536)
    {
        d = libsais16_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, 0, left_suffixes_count);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start, block_end;
        for (block_start = 0; block_start < left_suffixes_count; block_start = block_end)
        {
            /* Clamp the last block to the end of the scanned range. */
            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > left_suffixes_count) { block_end = left_suffixes_count; }

            d = libsais16_partial_sorting_scan_left_to_right_32s_6k_block_omp(T, SA, buckets, d, thread_state[0].state.cache, block_start, block_end - block_start, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif

    return d;
}

static sa_sint_t libsais16_partial_sorting_scan_left_to_right_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d,
sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    /*
     * Entry point of the 4k left-to-right scan. Seeds SA with position n - 1
     * (comparison flag in bit SAINT_BIT - 1, SUFFIX_GROUP_MARKER set) at the
     * cursor induction_bucket[T[n - 1]], records the pre-incremented d in
     * distinct_names, then scans SA[0 .. n): sequentially when threads == 1 or
     * n < 65536, otherwise in blocks of threads * LIBSAIS_PER_THREAD_CACHE_SIZE
     * entries through the block driver. Returns the updated d.
     */
    sa_sint_t * RESTRICT induction_bucket = &buckets[2 * (fast_sint_t)k];
    sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k];

    SA[induction_bucket[T[n - 1]]++] = (n - 1) | (sa_sint_t)((sa_uint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)) | SUFFIX_GROUP_MARKER;
    distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] < T[n - 1])] = ++d;

    if (threads == 1 || n < 65536)
    {
        d = libsais16_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, 0, n);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start, block_end;
        for (block_start = 0; block_start < n; block_start = block_end)
        {
            /* Clamp the last block to n. */
            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; }

            d = libsais16_partial_sorting_scan_left_to_right_32s_4k_block_omp(T, SA, k, buckets, d, thread_state[0].state.cache, block_start, block_end - block_start, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif

    return d;
}

/*
 * Entry point of the 1k left-to-right scan. Seeds SA with position n - 1
 * (comparison flag T[n - 2] < T[n - 1] in bit SAINT_BIT - 1) at the cursor
 * buckets[T[n - 1]], then scans SA[0 .. n): sequentially when threads == 1 or
 * n < 65536, otherwise in blocks of threads * LIBSAIS_PER_THREAD_CACHE_SIZE
 * entries through the block driver.
 */
static void libsais16_partial_sorting_scan_left_to_right_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    SA[buckets[T[n - 1]]++] = (n - 1) | (sa_sint_t)((sa_uint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));

    if (threads == 1 || n < 65536)
    {
        libsais16_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, 0, n);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start, block_end;
        for (block_start = 0; block_start < n; block_start = block_end)
        {
            /* Clamp the last block to n. */
            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; }

            libsais16_partial_sorting_scan_left_to_right_32s_1k_block_omp(T, SA, buckets, thread_state[0].state.cache, block_start, block_end - block_start, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif
}

static void libsais16_partial_sorting_shift_markers_16u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, const
sa_sint_t * RESTRICT buckets, sa_sint_t threads)
{
    /*
     * For every c from BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0) down to
     * BUCKETS_INDEX2(1, 0), walks SA from index temp_bucket[c] - 1 down to
     * index buckets[c - BUCKETS_INDEX2(1, 0)] and shifts the SAINT_MIN bit by
     * one entry in walk order: each entry receives the bit of the entry
     * visited just before it, and the first entry of each range receives a set
     * bit. All other bits are left unchanged. temp_bucket is the part of
     * buckets starting at offset 4 * ALPHABET_SIZE. The ranges are distributed
     * across threads only when threads > 1 and n >= 65536.
     */
    const fast_sint_t prefetch_distance = 64;

    const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];

    fast_sint_t c;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && n >= 65536)
#else
    UNUSED(threads); UNUSED(n);
#endif
    for (c = BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); c >= BUCKETS_INDEX2(1, 0); c -= BUCKETS_INDEX2(1, 0))
    {
        /* s holds the bit to hand to the next entry; q is the XOR mask that swaps it with the entry's own bit. */
        fast_sint_t i, j; sa_sint_t s = SAINT_MIN;
        for (i = (fast_sint_t)temp_bucket[c] - 1, j = (fast_sint_t)buckets[c - BUCKETS_INDEX2(1, 0)] + 3; i >= j; i -= 4)
        {
            libsais16_prefetchw(&SA[i - prefetch_distance]);

            sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s; s = s ^ q0; SA[i - 0] = p0 ^ q0;
            sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1;
            sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2;
            sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3;
        }

        /* Tail loop: the last up-to-three entries of the range. */
        for (j -= 3; i >= j; i -= 1)
        {
            sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q;
        }
    }
}

/*
 * 32-bit-symbol (6k) counterpart of the marker shift. For every c from k - 1
 * down to 1, walks SA from index buckets[BUCKETS_INDEX4(c, 0)] - 1 down to
 * index temp_bucket[BUCKETS_INDEX2(c - 1, 0)] and shifts the SAINT_MIN bit by
 * one entry in walk order (the first entry of each range receives a set bit).
 * temp_bucket is the part of buckets starting at offset 4 * k. The ranges are
 * distributed across threads only when threads > 1 and k >= 65536.
 */
static void libsais16_partial_sorting_shift_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, const sa_sint_t * RESTRICT buckets, sa_sint_t threads)
{
    const fast_sint_t prefetch_distance = 64;

    const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k];

    fast_sint_t c;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && k >= 65536)
#else
    UNUSED(threads);
#endif
    for (c = (fast_sint_t)k - 1; c >= 1; c -= 1)
    {
        fast_sint_t i, j; sa_sint_t s = SAINT_MIN;
        for (i = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 0)] - 1, j = (fast_sint_t)temp_bucket[BUCKETS_INDEX2(c - 1, 0)] + 3; i >= j; i -= 4)
        {
            libsais16_prefetchw(&SA[i - prefetch_distance]);

            sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s; s = s ^ q0; SA[i - 0] = p0 ^ q0;
            sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1;
            sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2;
            sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3;
        }

        for (j -= 3; i >= j; i -= 1)
        {
            sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q;
        }
    }
}

/*
 * Single pass over SA[n - 1 .. 0] that shifts the SUFFIX_GROUP_MARKER bit by
 * one positive entry in walk order, starting with the marker set. The XOR mask
 * is gated by (p > 0), so non-positive entries are left untouched and do not
 * update the carried bit.
 */
static void libsais16_partial_sorting_shift_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i; sa_sint_t s = SUFFIX_GROUP_MARKER;
    for (i = (fast_sint_t)n - 1; i >= 3; i -= 4)
    {
        libsais16_prefetchw(&SA[i - prefetch_distance]);

        sa_sint_t p0 = SA[i - 0], q0 = ((p0 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p0 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q0; SA[i - 0] = p0 ^ q0;
        sa_sint_t p1 = SA[i - 1], q1 = ((p1 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p1 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q1; SA[i - 1] = p1 ^ q1;
        sa_sint_t p2 = SA[i - 2], q2 = ((p2 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p2 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q2; SA[i - 2] = p2 ^ q2;
        sa_sint_t p3 = SA[i - 3], q3 = ((p3 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p3 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q3; SA[i - 3] = p3 ^ q3;
    }

    for (; i >= 0; i -= 1)
    {
        sa_sint_t p = SA[i], q = ((p & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q; SA[i] = p ^ q;
    }
}

/*
 * Copies the two BUCKETS_INDEX2 slots of every symbol 0 .. k - 1 from
 * temp_bucket (buckets at offset 4 * k) into the BUCKETS_INDEX4(0, 0) and
 * BUCKETS_INDEX4(0, 1) slots of the same symbol at the start of buckets.
 */
static void libsais16_partial_sorting_shift_buckets_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
{
    sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k];

    fast_sint_t i;
    for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0))
    {
        buckets[2 * i + BUCKETS_INDEX4(0, 0)] = temp_bucket[i + BUCKETS_INDEX2(0, 0)];
        buckets[2 * i + BUCKETS_INDEX4(0, 1)] = temp_bucket[i + BUCKETS_INDEX2(0, 1)];
    }
}

static sa_sint_t libsais16_partial_sorting_scan_right_to_left_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t
omp_block_size)
{
    /*
     * Scans SA[omp_block_start .. omp_block_start + omp_block_size) in
     * descending order. Every entry is processed (there is no sign test):
     * d is incremented when the entry is negative, p is the entry masked with
     * SAINT_MAX, v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]), and
     * (p - 1) is written to SA at the pre-decremented cursor
     * induction_bucket[v], with bit (SAINT_BIT - 1) set when distinct_names[v]
     * differs from d. distinct_names[v] is then set to d. Returns the
     * updated d.
     */
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais16_prefetchr(&SA[i - 2 * prefetch_distance]);

        libsais16_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1);
        libsais16_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2);
        libsais16_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1);
        libsais16_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2);

        sa_sint_t p0 = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
        SA[--induction_bucket[v0]] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d;

        sa_sint_t p1 = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
        SA[--induction_bucket[v1]] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
    }

    /* Tail loop: remaining entries, no prefetching. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
        SA[--induction_bucket[v]] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
    }

    return d;
}

/*
 * Same right-to-left scan as libsais16_partial_sorting_scan_right_to_left_16u,
 * except that entries whose bucket index v equals 1 are not stored and do not
 * update distinct_names (d is still advanced for them). Returns the updated d.
 */
static sa_sint_t libsais16_partial_gsa_scan_right_to_left_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais16_prefetchr(&SA[i - 2 * prefetch_distance]);

        libsais16_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1);
        libsais16_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2);
        libsais16_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1);
        libsais16_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2);

        sa_sint_t p0 = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
        if (v0 != 1)
        {
            SA[--induction_bucket[v0]] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d;
        }

        sa_sint_t p1 = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
        if (v1 != 1)
        {
            SA[--induction_bucket[v1]] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
        }
    }

    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
        if (v != 1)
        {
            SA[--induction_bucket[v]] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
        }
    }

    return d;
}

#if defined(LIBSAIS_OPENMP)

/*
 * Per-thread prepare step of the blocked right-to-left scan. Zeroes the first
 * 2 * k entries of induction_bucket and of distinct_names in the given buckets
 * array, then walks the slice of SA in descending order. Each SA value and its
 * BUCKETS_INDEX2 value are appended to cache (filled from index 0 upward),
 * induction_bucket[v] counts the occurrences of v, and distinct_names[v]
 * records a local d that starts at 1 and is incremented for every negative
 * entry. SA is not modified. On return state[0].state.position holds d - 1 and
 * state[0].state.count holds the number of cached entries.
 */
static void libsais16_partial_sorting_scan_right_to_left_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size, LIBSAIS_THREAD_STATE * RESTRICT state)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

    memset(induction_bucket, 0, (size_t)2 * (size_t)k * sizeof(sa_sint_t));
    memset(distinct_names , 0, (size_t)2 * (size_t)k * sizeof(sa_sint_t));

    fast_sint_t i, j, count = 0; sa_sint_t d = 1;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais16_prefetchr(&SA[i - 2 * prefetch_distance]);

        libsais16_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1);
        libsais16_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2);
        libsais16_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1);
        libsais16_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2);

        sa_sint_t p0 = cache[count].index = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); induction_bucket[v0]++; distinct_names[v0] = d;
        sa_sint_t p1 = cache[count].index = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); induction_bucket[v1]++; distinct_names[v1] = d;
    }

    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = cache[count].index = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); induction_bucket[v]++; distinct_names[v] = d;
    }

    state[0].state.position = (fast_sint_t)d - 1;
    state[0].state.count = count;
}

/*
 * Per-thread place step of the blocked right-to-left scan. Replays
 * cache[0 .. count) in order: d is incremented when the cached index is
 * negative, and (index - 1) is written to SA at the pre-decremented cursor
 * induction_bucket[symbol], with bit (SAINT_BIT - 1) set when
 * distinct_names[symbol] differs from d. distinct_names[symbol] is then set
 * to d. The cached index is used as stored, without masking.
 */
static void libsais16_partial_sorting_scan_right_to_left_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

    fast_sint_t i, j;
    for (i = 0, j = count - 1; i < j; i += 2)
    {
        libsais16_prefetchr(&cache[i + prefetch_distance]);

        sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol;
        SA[--induction_bucket[v0]] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d;

        sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol;
        SA[--induction_bucket[v1]] = (p1 - 1) |
(sa_sint_t)((sa_uint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d; } for (j += 1; i < j; i += 1) { sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol; SA[--induction_bucket[v]] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; } } static void libsais16_partial_gsa_scan_right_to_left_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d) { const fast_sint_t prefetch_distance = 64; sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; fast_sint_t i, j; for (i = 0, j = count - 1; i < j; i += 2) { libsais16_prefetchr(&cache[i + prefetch_distance]); sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol; if (v0 != 1) { SA[--induction_bucket[v0]] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; } sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol; if (v1 != 1) { SA[--induction_bucket[v1]] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d; } } for (j += 1; i < j; i += 1) { sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol; if (v != 1) { SA[--induction_bucket[v]] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; } } } static sa_sint_t libsais16_partial_sorting_scan_right_to_left_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ? 
k : 256) && omp_get_dynamic() == 0)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(k); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Split the block evenly; the stride is rounded down to a multiple of
           16 and the last thread takes whatever remains. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            d = libsais16_partial_sorting_scan_right_to_left_16u(T, SA, buckets, d, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /* Phase 1: every thread scans its own slice into its private
               buckets and cache. */
            {
                libsais16_partial_sorting_scan_right_to_left_16u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]);
            }

            #pragma omp barrier

            /* Phase 2 (master only): fold the per-thread buckets into the
               global ones, visiting threads from the last slice to the first. */
            #pragma omp master
            {
                sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
                sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

                fast_sint_t t;
                for (t = omp_num_threads - 1; t >= 0; --t)
                {
                    sa_sint_t * RESTRICT temp_induction_bucket = &thread_state[t].state.buckets[0 * ALPHABET_SIZE];
                    sa_sint_t * RESTRICT temp_distinct_names = &thread_state[t].state.buckets[2 * ALPHABET_SIZE];

                    fast_sint_t c;
                    /* The global pointer drops by the thread's private value B;
                       the thread receives the previous global pointer A. */
                    for (c = 0; c < 2 * k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A - B; temp_induction_bucket[c] = A; }

                    /* Private names B are offset by (d - 1); buckets with B == 0
                       keep the global value. The thread receives the previous
                       global value A. */
                    for (d -= 1, c = 0; c < 2 * k; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ?
D : A; temp_distinct_names[c] = A; }

                    /* Advance d by the thread's stored position and replace that
                       position with the value d had on entry to this iteration,
                       which is where the thread's place pass starts. */
                    d += 1 + (sa_sint_t)thread_state[t].state.position; thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position;
                }
            }

            #pragma omp barrier

            /* Phase 3: every thread replays its cache using its rebased
               private buckets and starting d. */
            {
                libsais16_partial_sorting_scan_right_to_left_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position);
            }
        }
#endif
    }

    return d;
}

/*
 * GSA variant of the block-parallel right-to-left partial sorting scan over
 * SA[block_start, block_start + block_size); returns the updated name
 * counter `d`.
 *
 * Same structure as the non-GSA block routine: the parallel region is entered
 * only when threads > 1, block_size >= 64 * max(k, 256) and OpenMP dynamic
 * adjustment is off. It differs in the sequential fallback
 * (..._partial_gsa_scan_right_to_left_16u) and in the place step
 * (..._partial_gsa_..._block_place). The prepare step is shared with the
 * non-GSA routine.
 */
static sa_sint_t libsais16_partial_gsa_scan_right_to_left_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ? k : 256) && omp_get_dynamic() == 0)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(k); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Stride rounded down to a multiple of 16; last thread takes the rest. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            d = libsais16_partial_gsa_scan_right_to_left_16u(T, SA, buckets, d, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /* Phase 1: per-thread prepare into private buckets and cache. */
            {
                libsais16_partial_sorting_scan_right_to_left_16u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]);
            }

            #pragma omp barrier

            /* Phase 2 (master only): merge private buckets into the global
               ones, last slice first. */
            #pragma omp master
            {
                sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
                sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

                fast_sint_t t;
                for (t = omp_num_threads - 1; t >= 0; --t)
                {
                    sa_sint_t * RESTRICT temp_induction_bucket = &thread_state[t].state.buckets[0 * ALPHABET_SIZE];
                    sa_sint_t * RESTRICT temp_distinct_names = &thread_state[t].state.buckets[2 * ALPHABET_SIZE];

                    fast_sint_t c;
                    /* Global pointer drops by the private value B; the thread
                       receives the previous global pointer A. */
                    for (c = 0; c < 2 * k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A - B; temp_induction_bucket[c] = A; }

                    /* Private names B are offset by (d - 1); buckets with B == 0
                       keep the global value. */
                    for (d -= 1, c = 0; c < 2 * k; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ?
D : A; temp_distinct_names[c] = A; }

                    /* Advance d by the thread's stored position; the thread's
                       position becomes the value d had on entry to this iteration. */
                    d += 1 + (sa_sint_t)thread_state[t].state.position; thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position;
                }
            }

            #pragma omp barrier

            /* Phase 3: per-thread GSA place step. */
            {
                libsais16_partial_gsa_scan_right_to_left_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position);
            }
        }
#endif
    }

    return d;
}

#endif

/*
 * Driver for the right-to-left partial sorting scan over 16-bit input.
 *
 * Scans SA[left_suffixes_count + 1, n - first_lms_suffix) from the high end
 * to the low end. With one thread, or fewer than 65536 entries to scan, the
 * sequential scan is used. Otherwise (OpenMP builds only) the range is walked
 * in runs of non-zero entries:
 *   - zero entries are skipped;
 *   - a run is capped at threads * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads)
 *     entries and never extends below scan_start;
 *   - runs shorter than 32 entries are induced inline;
 *   - longer runs are handed to the block-parallel routine.
 *
 * `d` is a by-value parameter; its final value is not reported to the caller.
 */
static void libsais16_partial_sorting_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
    fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;

    if (threads == 1 || (scan_end - scan_start) < 65536)
    {
        libsais16_partial_sorting_scan_right_to_left_16u(T, SA, buckets, d, scan_start, scan_end - scan_start);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
        sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

        fast_sint_t block_start;
        for (block_start = scan_end - 1; block_start >= scan_start; )
        {
            if (SA[block_start] == 0)
            {
                block_start--;
            }
            else
            {
                /* The run covers (block_end, block_start]; block_end stops at
                   the first zero entry or at the size / range limit. */
                fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < scan_start) { block_max_end = scan_start - 1; }
                fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
                fast_sint_t block_size = block_start - block_end;

                if (block_size < 32)
                {
                    /* Short run: induce in place, one entry at a time. */
                    for (; block_start > block_end; block_start -= 1)
                    {
                        sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
                        SA[--induction_bucket[v]] = (p - 1) |
(sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
                    }
                }
                else
                {
                    /* Long run: process SA[block_end + 1 .. block_start] with the
                       block-parallel routine, then continue below it. */
                    d = libsais16_partial_sorting_scan_right_to_left_16u_block_omp(T, SA, k, buckets, d, block_end + 1, block_size, threads, thread_state);
                    block_start = block_end;
                }
            }
        }
    }
#else
    UNUSED(k); UNUSED(thread_state);
#endif
}

/*
 * GSA variant of the right-to-left partial sorting scan driver.
 *
 * Scans SA[left_suffixes_count + 1, n - first_lms_suffix) from the high end
 * to the low end. With one thread, or fewer than 65536 entries, the
 * sequential GSA scan is used. Otherwise (OpenMP builds only) runs of
 * non-zero entries are either induced inline (fewer than 32 entries) or
 * handed to the block-parallel GSA routine. Entries whose bucket index is 1
 * are never stored.
 * NOTE(review): bucket index 1 is presumably BUCKETS_INDEX2(0, 1) - confirm
 * against the macro.
 *
 * `d` is a by-value parameter; its final value is not reported to the caller.
 */
static void libsais16_partial_gsa_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
    fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;

    if (threads == 1 || (scan_end - scan_start) < 65536)
    {
        libsais16_partial_gsa_scan_right_to_left_16u(T, SA, buckets, d, scan_start, scan_end - scan_start);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
        sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

        fast_sint_t block_start;
        for (block_start = scan_end - 1; block_start >= scan_start; )
        {
            if (SA[block_start] == 0)
            {
                block_start--;
            }
            else
            {
                /* The run covers (block_end, block_start]; block_end stops at
                   the first zero entry or at the size / range limit. */
                fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < scan_start) { block_max_end = scan_start - 1; }
                fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
                fast_sint_t block_size = block_start - block_end;

                if (block_size < 32)
                {
                    /* Short run: induce in place; d still advances for entries
                       that are skipped because v == 1. */
                    for (; block_start > block_end; block_start -= 1)
                    {
                        sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
                        if (v != 1) { SA[--induction_bucket[v]] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; }
                    }
                }
                else
                {
                    d =
libsais16_partial_gsa_scan_right_to_left_16u_block_omp(T, SA, k, buckets, d, block_end + 1, block_size, threads, thread_state);
                    block_start = block_end;
                }
            }
        }
    }
#else
    UNUSED(k); UNUSED(thread_state);
#endif
}

/*
 * Sequential right-to-left partial sorting scan over 32-bit input ("6k"
 * bucket layout) for SA[omp_block_start, omp_block_start + omp_block_size).
 *
 * For every entry, from the high index down:
 *   - `d` is incremented when the entry is negative;
 *   - p is the entry with the top bit cleared and
 *     v = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]);
 *   - (p - 1) is stored at SA[--buckets[v]], with the top bit set when
 *     buckets[2 + v] differs from `d`; buckets[2 + v] is then set to `d`.
 *
 * Returns the updated `d`.
 */
static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Main loop: two entries per iteration. The lower bound keeps the
       look-ahead reads of SA at 2 * prefetch_distance inside the block. */
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais16_prefetchr(&SA[i - 3 * prefetch_distance]);

        libsais16_prefetchr(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 1);
        libsais16_prefetchr(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 2);
        libsais16_prefetchr(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 1);
        libsais16_prefetchr(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 2);

        /* Prefetch the bucket the entries one prefetch_distance ahead will
           touch; (p > 0) keeps the T index non-negative when p is 0. */
        sa_sint_t p0 = SA[i - prefetch_distance - 0] & SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); libsais16_prefetchw(&buckets[v0]);
        sa_sint_t p1 = SA[i - prefetch_distance - 1] & SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0); libsais16_prefetchw(&buckets[v1]);

        sa_sint_t p2 = SA[i - 0]; d += (p2 < 0); p2 &= SAINT_MAX; sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] > T[p2 - 1]);
        SA[--buckets[v2]] = (p2 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1)); buckets[2 + v2] = d;

        sa_sint_t p3 = SA[i - 1]; d += (p3 < 0); p3 &= SAINT_MAX; sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] > T[p3 - 1]);
        SA[--buckets[v3]] = (p3 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1)); buckets[2 + v3] = d;
    }

    /* Tail: remaining entries down to omp_block_start, without prefetching. */
    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]);
        SA[--buckets[v]] = (p - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v] != d) << (SAINT_BIT -
1)); buckets[2 + v] = d;
    }

    return d;
}

/*
 * Sequential right-to-left partial sorting scan over 32-bit input ("4k"
 * bucket layout) for SA[omp_block_start, omp_block_start + omp_block_size).
 *
 * induction_bucket starts at buckets[3 * k] and is indexed by symbol;
 * distinct_names starts at buckets[0] and is indexed by BUCKETS_INDEX2.
 *
 * For every positive entry p, from the high index down:
 *   - the slot is cleared (SA[i] = 0);
 *   - `d` grows by p >> (SUFFIX_GROUP_BIT - 1), then the SUFFIX_GROUP_MARKER
 *     bits are cleared from p;
 *   - (p - 1) is stored at SA[--induction_bucket[T[p - 1]]], with the top bit
 *     carrying (T[p - 2] > T[p - 1]) and bit (SUFFIX_GROUP_BIT - 1) set when
 *     distinct_names[v] differs from `d`; distinct_names[v] is then set to `d`.
 * Non-positive entries are left untouched.
 *
 * Returns the updated `d`.
 */
static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[3 * (fast_sint_t)k];
    sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k];

    fast_sint_t i, j;
    /* Main loop: two entries per iteration with two levels of look-ahead. */
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais16_prefetchw(&SA[i - 3 * prefetch_distance]);

        /* Far look-ahead: prefetch the text around upcoming suffixes; index 2
           is substituted for non-positive entries so Ts - 2 is a valid address. */
        sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 & ~SUFFIX_GROUP_MARKER : 2]; libsais16_prefetchr(Ts0 - 1); libsais16_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ? s1 & ~SUFFIX_GROUP_MARKER : 2]; libsais16_prefetchr(Ts1 - 1); libsais16_prefetchr(Ts1 - 2);
        /* Near look-ahead: prefetch the bucket slots those suffixes will update. */
        sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais16_prefetchw(&induction_bucket[Ts2]); libsais16_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); }
        sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais16_prefetchw(&induction_bucket[Ts3]); libsais16_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); }

        sa_sint_t p0 = SA[i - 0];
        if (p0 > 0)
        {
            SA[i - 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
            SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d;
        }

        sa_sint_t p1 = SA[i - 1];
        if (p1 > 0)
        {
            SA[i - 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); p1 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v1 =
BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
            SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
        }
    }

    /* Tail: remaining entries down to omp_block_start, without prefetching. */
    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i];
        if (p > 0)
        {
            SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
            SA[--induction_bucket[T[p - 1]]] = (p - 1) | (sa_sint_t)((sa_uint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
        }
    }

    return d;
}

/*
 * Sequential right-to-left partial sorting scan over 32-bit input ("1k"
 * layout: a single induction bucket per symbol, no name counter) for
 * SA[omp_block_start, omp_block_start + omp_block_size).
 *
 * For every positive entry p, from the high index down, the slot is cleared
 * and (p - 1) is stored at SA[--induction_bucket[T[p - 1]]], with the top bit
 * carrying (T[p - 2] > T[p - 1]). Non-positive entries are left untouched.
 */
static void libsais16_partial_sorting_scan_right_to_left_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Main loop: two entries per iteration with two levels of look-ahead. */
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais16_prefetchw(&SA[i - 3 * prefetch_distance]);

        /* Far look-ahead: prefetch T[s - 1]; index 1 stands in for
           non-positive entries so Ts - 1 is a valid address. */
        sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 1]; libsais16_prefetchr(Ts0 - 1);
        sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 : 1]; libsais16_prefetchr(Ts1 - 1);
        /* Near look-ahead: prefetch the target bucket and T[s - 2]. */
        sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais16_prefetchw(&induction_bucket[T[s2 - 1]]); libsais16_prefetchr(&T[s2] - 2); }
        sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais16_prefetchw(&induction_bucket[T[s3 - 1]]); libsais16_prefetchr(&T[s3] - 2); }

        sa_sint_t p0 = SA[i - 0]; if (p0 > 0) { SA[i - 0] = 0; SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)); }
        sa_sint_t p1 = SA[i - 1]; if (p1 > 0) { SA[i - 1] = 0; SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)); }
    }

    /* Tail: remaining entries down to omp_block_start, without prefetching. */
    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; if (p > 0) { SA[i] = 0; SA[--induction_bucket[T[p - 1]]] = (p - 1) | (sa_sint_t)((sa_uint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)); }
    }
}

#if defined(LIBSAIS_OPENMP)

/*
 * Gather step of the block-parallel "6k" right-to-left scan.
 *
 * Copies SA[i] into cache[i].index for every i in
 * [omp_block_start, omp_block_start + omp_block_size) and stores in
 * cache[i].symbol the bucket index
 * BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]) of the entry, where p is the
 * entry with the top bit cleared. The symbol is 0 when p is 0. SA is only
 * read here.
 *
 * `cache` is indexed with the same absolute index as SA.
 */
static void libsais16_partial_sorting_scan_right_to_left_32s_6k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Main loop: two entries per iteration, prefetching SA, T and the cache. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais16_prefetchr(&SA[i + 2 * prefetch_distance]);

        libsais16_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
        libsais16_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
        libsais16_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
        libsais16_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);

        libsais16_prefetchw(&cache[i + prefetch_distance]);

        sa_sint_t p0 = cache[i + 0].index = SA[i + 0]; sa_sint_t symbol0 = 0; p0 &= SAINT_MAX; if (p0 != 0) { symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); } cache[i + 0].symbol = symbol0;
        sa_sint_t p1 = cache[i + 1].index = SA[i + 1]; sa_sint_t symbol1 = 0; p1 &= SAINT_MAX;
if (p1 != 0) { symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); } cache[i + 1].symbol = symbol1;
    }

    /* Tail: remaining entries up to the end of the slice, without prefetching. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = cache[i].index = SA[i]; sa_sint_t symbol = 0; p &= SAINT_MAX; if (p != 0) { symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]); } cache[i].symbol = symbol;
    }
}

/*
 * Gather step of the block-parallel "4k" right-to-left scan.
 *
 * For every i in [omp_block_start, omp_block_start + omp_block_size):
 *   - if SA[i] is positive, the slot is cleared, the raw value is stored in
 *     cache[i].index and cache[i].symbol receives
 *     BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]), computed on the value
 *     with the SUFFIX_GROUP_MARKER bits cleared;
 *   - otherwise cache[i].symbol is set to SAINT_MIN and cache[i].index is
 *     not written.
 *
 * `cache` is indexed with the same absolute index as SA.
 */
static void libsais16_partial_sorting_scan_right_to_left_32s_4k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Main loop: two entries per iteration, prefetching SA, T and the cache. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais16_prefetchw(&SA[i + 2 * prefetch_distance]);

        /* Index 2 stands in for non-positive entries so Ts - 2 is a valid address. */
        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 & ~SUFFIX_GROUP_MARKER : 2]; libsais16_prefetchr(Ts0 - 1); libsais16_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 & ~SUFFIX_GROUP_MARKER : 2]; libsais16_prefetchr(Ts1 - 1); libsais16_prefetchr(Ts1 - 2);

        libsais16_prefetchw(&cache[i + prefetch_distance]);

        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { SA[i + 0] = 0; cache[i + 0].index = p0; p0 &= ~SUFFIX_GROUP_MARKER; symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); } cache[i + 0].symbol = symbol0;
        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { SA[i + 1] = 0; cache[i + 1].index = p1; p1 &= ~SUFFIX_GROUP_MARKER; symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); } cache[i + 1].symbol = symbol1;
    }

    /* Tail: remaining entries up to the end of the slice, without prefetching. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { SA[i] = 0; cache[i].index = p; p &= ~SUFFIX_GROUP_MARKER; symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); } cache[i].symbol = symbol;
    }
}

/*
 * Gather step of the block-parallel "1k" right-to-left scan.
 *
 * For every i in [omp_block_start, omp_block_start + omp_block_size):
 *   - if SA[i] is positive, the slot is cleared, cache[i].index receives
 *     (p - 1) with the top bit carrying (T[p - 2] > T[p - 1]), and
 *     cache[i].symbol receives T[p - 1];
 *   - otherwise cache[i].symbol is set to SAINT_MIN and cache[i].index is
 *     not written.
 *
 * `cache` is indexed with the same absolute index as SA.
 */
static void libsais16_partial_sorting_scan_right_to_left_32s_1k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Main loop: two entries per iteration, prefetching SA, T and the cache. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais16_prefetchw(&SA[i + 2 * prefetch_distance]);

        /* Index 2 stands in for non-positive entries so Ts - 2 is a valid address. */
        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais16_prefetchr(Ts0 - 1); libsais16_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 : 2]; libsais16_prefetchr(Ts1 - 1); libsais16_prefetchr(Ts1 - 2);

        libsais16_prefetchw(&cache[i + prefetch_distance]);

        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { SA[i + 0] = 0; cache[i + 0].index = (p0 - 1) | (sa_sint_t)((sa_uint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)); symbol0 = T[p0 - 1]; } cache[i + 0].symbol = symbol0;
        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { SA[i + 1] = 0; cache[i + 1].index = (p1 - 1) | (sa_sint_t)((sa_uint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)); symbol1 = T[p1 - 1]; } cache[i + 1].symbol = symbol1;
    }

    /* Tail: remaining entries up to the end of the slice, without prefetching. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { SA[i] = 0; cache[i].index = (p - 1) | (sa_sint_t)((sa_uint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)); symbol = T[p - 1]; } cache[i].symbol = symbol;
    }
}

/*
 * Sort step of the block-parallel "6k" right-to-left scan; works on the
 * cache only and returns the updated name counter `d`.
 *
 * For every cache entry in [omp_block_start, omp_block_start + omp_block_size),
 * from the high index down:
 *   - `d` is incremented when the cached index is negative;
 *   - cache[i].symbol is replaced by the destination position --buckets[v];
 *   - cache[i].index becomes (p - 1), with the top bit set when
 *     buckets[2 + v] differs from `d`; buckets[2 + v] is then set to `d`;
 *   - if the destination is at or above omp_block_start, cache[destination]
 *     is filled with the new index and its bucket index.
 */
static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_6k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Main loop: two entries per iteration, prefetching cache and buckets. */
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais16_prefetchw(&cache[i - 2 * prefetch_distance]);

        libsais16_prefetchw(&buckets[cache[i - prefetch_distance - 0].symbol]);
        libsais16_prefetchw(&buckets[cache[i - prefetch_distance - 1].symbol]);

        sa_sint_t v0 = cache[i - 0].symbol, p0 = cache[i - 0].index; d += (p0 < 0); cache[i - 0].symbol = --buckets[v0]; cache[i - 0].index = (p0 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1)); buckets[2 + v0] = d;
        if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t s = cache[i - 0].symbol, q = (cache[s].index = cache[i - 0].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); }

        sa_sint_t v1 = cache[i - 1].symbol, p1 = cache[i - 1].index; d += (p1 < 0); cache[i - 1].symbol =
--buckets[v1]; cache[i - 1].index = (p1 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1)); buckets[2 + v1] = d;
        if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t s = cache[i - 1].symbol, q = (cache[s].index = cache[i - 1].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); }
    }

    /* Tail: remaining entries down to omp_block_start, without prefetching. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t v = cache[i].symbol, p = cache[i].index; d += (p < 0); cache[i].symbol = --buckets[v]; cache[i].index = (p - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
        if (cache[i].symbol >= omp_block_start) { sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); }
    }

    return d;
}

/*
 * Sort step of the block-parallel "4k" right-to-left scan; works on the
 * cache only and returns the updated name counter `d`.
 *
 * induction_bucket starts at buckets[3 * k] and is indexed by (v >> 1);
 * distinct_names starts at buckets[0] and is indexed by v.
 *
 * For every cache entry whose symbol v is non-negative, from the high index
 * down:
 *   - `d` grows by index >> (SUFFIX_GROUP_BIT - 1);
 *   - cache[i].symbol is replaced by the destination --induction_bucket[v >> 1];
 *   - cache[i].index becomes (p - 1), with the low bit of v moved to the top
 *     bit and bit (SUFFIX_GROUP_BIT - 1) set when distinct_names[v] differs
 *     from `d`; distinct_names[v] is then set to `d`;
 *   - if the destination is at or above omp_block_start and the new index is
 *     positive, the value moves to cache[destination] (cache[i].index is
 *     zeroed) together with its BUCKETS_INDEX2 symbol.
 * Entries with a negative symbol are skipped.
 * NOTE(review): the low bit of v is presumably the (T[p - 2] > T[p - 1])
 * flag packed by BUCKETS_INDEX2 - confirm against the macro.
 */
static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_4k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[3 * (fast_sint_t)k];
    sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k];

    fast_sint_t i, j;
    /* Main loop: two entries per iteration, prefetching cache and buckets. */
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais16_prefetchw(&cache[i - 2 * prefetch_distance]);

        /* Slot 0 stands in for non-positive symbols so the prefetch address
           is always inside the bucket arrays. */
        sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 > 0 ? s0 >> 1 : 0]; libsais16_prefetchw(Is0); const sa_sint_t * Ds0 = &distinct_names[s0 > 0 ? s0 : 0]; libsais16_prefetchw(Ds0);
        sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 > 0 ? s1 >> 1 : 0]; libsais16_prefetchw(Is1); const sa_sint_t * Ds1 = &distinct_names[s1 > 0 ? s1 : 0]; libsais16_prefetchw(Ds1);

        sa_sint_t v0 = cache[i - 0].symbol;
        if (v0 >= 0)
        {
            sa_sint_t p0 = cache[i - 0].index; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); cache[i - 0].symbol = --induction_bucket[v0 >> 1]; cache[i - 0].index = (p0 - 1) | (sa_sint_t)((sa_uint_t)v0 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d;
            if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; if (np > 0) { cache[i - 0].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } }
        }

        sa_sint_t v1 = cache[i - 1].symbol;
        if (v1 >= 0)
        {
            sa_sint_t p1 = cache[i - 1].index; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); cache[i - 1].symbol = --induction_bucket[v1 >> 1]; cache[i - 1].index = (p1 - 1) | (sa_sint_t)((sa_uint_t)v1 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
            if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; if (np > 0) { cache[i - 1].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } }
        }
    }

    /* Tail: remaining entries down to omp_block_start, without prefetching. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t v = cache[i].symbol;
        if (v >= 0)
        {
            sa_sint_t p = cache[i].index; d += (p >> (SUFFIX_GROUP_BIT - 1)); cache[i].symbol = --induction_bucket[v >> 1]; cache[i].index = (p - 1) | (sa_sint_t)((sa_uint_t)v << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
            if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[i].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } }
        }
    }

    return d;
}

/*
 * Sort step of the block-parallel "1k" right-to-left scan; works on the
 * cache only.
 *
 * For every cache entry whose symbol v is non-negative, from the high index
 * down, cache[i].symbol is replaced by the destination --induction_bucket[v].
 * If that destination is at or above omp_block_start and the cached index np
 * is positive, cache[i].index is zeroed and cache[destination] receives
 * (np - 1), with the top bit carrying (T[np - 2] > T[np - 1]), plus the
 * symbol T[np - 1]. Entries with a negative symbol are skipped.
 */
static void libsais16_partial_sorting_scan_right_to_left_32s_1k_block_sort(const sa_sint_t *
RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Main loop: two cache entries per iteration, high index to low. */
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais16_prefetchw(&cache[i - 2 * prefetch_distance]);

        /* Slot 0 stands in for non-positive symbols so the prefetch address
           is always inside induction_bucket. */
        sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 > 0 ? s0 : 0]; libsais16_prefetchw(Is0);
        sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 > 0 ? s1 : 0]; libsais16_prefetchw(Is1);

        sa_sint_t v0 = cache[i - 0].symbol;
        if (v0 >= 0)
        {
            /* The symbol field is reused to hold the destination position. */
            cache[i - 0].symbol = --induction_bucket[v0];
            /* Destination inside the cached block: forward the value so it is
               picked up when the scan reaches that position. */
            if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; if (np > 0) { cache[i - 0].index = 0; cache[ni].index = (np - 1) | (sa_sint_t)((sa_uint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; } }
        }

        sa_sint_t v1 = cache[i - 1].symbol;
        if (v1 >= 0)
        {
            cache[i - 1].symbol = --induction_bucket[v1];
            if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; if (np > 0) { cache[i - 1].index = 0; cache[ni].index = (np - 1) | (sa_sint_t)((sa_uint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; }}
        }
    }

    /* Tail: remaining entries down to omp_block_start, without prefetching. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t v = cache[i].symbol;
        if (v >= 0)
        {
            cache[i].symbol = --induction_bucket[v];
            if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[i].index = 0; cache[ni].index = (np - 1) | (sa_sint_t)((sa_uint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; } }
        }
    }
}

/*
 * Processes one block SA[block_start, block_start + block_size) of the "6k"
 * right-to-left scan and returns the updated name counter `d`.
 *
 * The parallel region is entered only when threads > 1 and the block holds at
 * least 16384 entries. With a single-thread team the sequential scan runs.
 * Otherwise the work is done in three phases separated by barriers: parallel
 * gather into the cache, sort on the master thread, parallel placement of the
 * cached suffixes. `cache - block_start` is passed down so the helpers can
 * index the cache with absolute SA positions.
 */
static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d,
LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Stride rounded down to a multiple of 16; last thread takes the rest. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            d = libsais16_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /* Phase 1: each thread gathers its slice of SA into the cache. */
            {
                libsais16_partial_sorting_scan_right_to_left_32s_6k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* Phase 2 (master only): sort the whole block inside the cache. */
            #pragma omp master
            {
                d = libsais16_partial_sorting_scan_right_to_left_32s_6k_block_sort(T, buckets, d, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            /* Phase 3: each thread writes its slice of cached results to SA. */
            {
                libsais16_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }

    return d;
}

/*
 * Processes one block SA[block_start, block_start + block_size) of the "4k"
 * right-to-left scan and returns the updated name counter `d`.
 *
 * The parallel region is entered only when threads > 1 and the block holds at
 * least 16384 entries. With a single-thread team the sequential scan runs.
 * Otherwise: parallel gather, sort on the master thread, parallel
 * compact-and-place, with a barrier between phases.
 */
static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_4k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t
omp_num_threads = 1;
#endif
        /* Stride rounded down to a multiple of 16; last thread takes the rest. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            d = libsais16_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /* Phase 1: each thread gathers its slice of SA into the cache. */
            {
                libsais16_partial_sorting_scan_right_to_left_32s_4k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* Phase 2 (master only): sort the whole block inside the cache. */
            #pragma omp master
            {
                d = libsais16_partial_sorting_scan_right_to_left_32s_4k_block_sort(T, k, buckets, d, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            /* Phase 3: each thread writes its slice of cached results to SA. */
            {
                libsais16_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }

    return d;
}

/*
 * Processes one block SA[block_start, block_start + block_size) of the "1k"
 * right-to-left scan.
 *
 * The parallel region is entered only when threads > 1 and the block holds at
 * least 16384 entries. With a single-thread team the sequential scan runs.
 * Otherwise: parallel gather, sort on the master thread, parallel
 * compact-and-place, with a barrier between phases.
 */
static void libsais16_partial_sorting_scan_right_to_left_32s_1k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Stride rounded down to a multiple of 16; last thread takes the rest. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            libsais16_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /* Phase 1: each thread gathers its slice of SA into the cache. */
            {
                libsais16_partial_sorting_scan_right_to_left_32s_1k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* Phase 2 (master only): sort the whole block inside the cache. */
            #pragma omp master
            {
                libsais16_partial_sorting_scan_right_to_left_32s_1k_block_sort(T, buckets, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            /* Phase 3: each thread writes its slice of cached results to SA. */
            {
                libsais16_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }
}

#endif

/*
 * Driver for the "6k" right-to-left partial sorting scan over 32-bit input.
 *
 * Scans SA[left_suffixes_count + 1, n - first_lms_suffix) from the high end
 * to the low end and returns the updated name counter `d`. With one thread,
 * or fewer than 65536 entries, the sequential scan is used. Otherwise
 * (OpenMP builds only) the range is cut into blocks of at most
 * threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries, processed from the top
 * block downwards using the cache of thread_state[0].
 */
static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
    fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;

    if (threads == 1 || (scan_end - scan_start) < 65536)
    {
        d = libsais16_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, scan_start, scan_end - scan_start);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start, block_end;
        /* Each block covers (block_end, block_start]; the last one is clamped
           so that it ends exactly at scan_start. */
        for (block_start = scan_end - 1; block_start >= scan_start; block_start = block_end)
        {
            block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < scan_start) { block_end = scan_start - 1; }

            d = libsais16_partial_sorting_scan_right_to_left_32s_6k_block_omp(T, SA, buckets, d, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif

    return d;
}

/*
 * Driver for the "4k" right-to-left partial sorting scan over 32-bit input.
 *
 * Scans SA[0, n) from the high end to the low end and returns the updated
 * name counter `d`. With one thread, or n < 65536, the sequential scan is
 * used. Otherwise (OpenMP builds only) the range is cut into blocks of at
 * most threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries, processed from the
 * top block downwards using the cache of thread_state[0].
 */
static sa_sint_t libsais16_partial_sorting_scan_right_to_left_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets,
sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    if (threads == 1 || n < 65536)
    {
        d = libsais16_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, 0, n);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start, block_end;
        /* Each block covers (block_end, block_start]; the last one is clamped
           so that it ends exactly at index 0. */
        for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end)
        {
            block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; }

            d = libsais16_partial_sorting_scan_right_to_left_32s_4k_block_omp(T, SA, k, buckets, d, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif

    return d;
}

/*
 * Driver for the "1k" right-to-left partial sorting scan over 32-bit input.
 *
 * Scans SA[0, n) from the high end to the low end. With one thread, or
 * n < 65536, the sequential scan is used. Otherwise (OpenMP builds only) the
 * range is cut into blocks of at most threads * LIBSAIS_PER_THREAD_CACHE_SIZE
 * entries, processed from the top block downwards using the cache of
 * thread_state[0].
 */
static void libsais16_partial_sorting_scan_right_to_left_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    if (threads == 1 || n < 65536)
    {
        libsais16_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, 0, n);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start, block_end;
        /* Each block covers (block_end, block_start]; the last one is clamped
           so that it ends exactly at index 0. */
        for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end)
        {
            block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; }

            libsais16_partial_sorting_scan_right_to_left_32s_1k_block_omp(T, SA, buckets, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif
}

/*
 * Compacts the negative entries of
 * SA[omp_block_start, omp_block_start + omp_block_size) to the front of that
 * range, preserving their order, and returns the index one past the last
 * kept entry.
 *
 * Every entry s is rewritten as (s - SUFFIX_GROUP_MARKER) with the
 * SUFFIX_GROUP_MARKER bits cleared and stored at SA[l] unconditionally; the
 * write cursor l only advances when s is negative, so values from
 * non-negative entries are overwritten by the next store.
 */
static fast_sint_t libsais16_partial_sorting_gather_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j, l;
    /* Main loop: four entries per iteration. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4)
    {
        libsais16_prefetchr(&SA[i + prefetch_distance]);

        sa_uint_t s0 = (sa_uint_t)SA[i + 0]; SA[l] = (sa_sint_t)((s0 - (sa_uint_t)SUFFIX_GROUP_MARKER) &
(sa_uint_t)(~SUFFIX_GROUP_MARKER)); l += ((sa_sint_t)s0 < 0);
        sa_uint_t s1 = (sa_uint_t)SA[i + 1]; SA[l] = (sa_sint_t)((s1 - (sa_uint_t)SUFFIX_GROUP_MARKER) & (sa_uint_t)(~SUFFIX_GROUP_MARKER)); l += ((sa_sint_t)s1 < 0);
        sa_uint_t s2 = (sa_uint_t)SA[i + 2]; SA[l] = (sa_sint_t)((s2 - (sa_uint_t)SUFFIX_GROUP_MARKER) & (sa_uint_t)(~SUFFIX_GROUP_MARKER)); l += ((sa_sint_t)s2 < 0);
        sa_uint_t s3 = (sa_uint_t)SA[i + 3]; SA[l] = (sa_sint_t)((s3 - (sa_uint_t)SUFFIX_GROUP_MARKER) & (sa_uint_t)(~SUFFIX_GROUP_MARKER)); l += ((sa_sint_t)s3 < 0);
    }

    /* Tail: up to three remaining entries. */
    for (j += 3; i < j; i += 1)
    {
        sa_uint_t s = (sa_uint_t)SA[i]; SA[l] = (sa_sint_t)((s - (sa_uint_t)SUFFIX_GROUP_MARKER) & (sa_uint_t)(~SUFFIX_GROUP_MARKER)); l += ((sa_sint_t)s < 0);
    }

    return l;
}

/*
 * Compacts the negative entries of
 * SA[omp_block_start, omp_block_start + omp_block_size) to the front of that
 * range, preserving their order, and returns the index one past the last
 * kept entry.
 *
 * Every entry is stored at SA[l] with its top bit cleared (s & SAINT_MAX);
 * the write cursor l only advances when the entry was negative, so values
 * from non-negative entries are overwritten by the next store.
 */
static fast_sint_t libsais16_partial_sorting_gather_lms_suffixes_32s_1k(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j, l;
    /* Main loop: four entries per iteration. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4)
    {
        libsais16_prefetchr(&SA[i + prefetch_distance]);

        sa_sint_t s0 = SA[i + 0]; SA[l] = s0 & SAINT_MAX; l += (s0 < 0);
        sa_sint_t s1 = SA[i + 1]; SA[l] = s1 & SAINT_MAX; l += (s1 < 0);
        sa_sint_t s2 = SA[i + 2]; SA[l] = s2 & SAINT_MAX; l += (s2 < 0);
        sa_sint_t s3 = SA[i + 3]; SA[l] = s3 & SAINT_MAX; l += (s3 < 0);
    }

    /* Tail: up to three remaining entries. */
    for (j += 3; i < j; i += 1)
    {
        sa_sint_t s = SA[i]; SA[l] = s & SAINT_MAX; l += (s < 0);
    }

    return l;
}

/*
 * Gathers the negative entries of SA[0, n) to the front of SA ("4k"
 * variant), optionally in parallel.
 *
 * The parallel region is entered only when threads > 1 and n >= 65536. With
 * a single-thread team the sequential gather runs over the whole range.
 * Otherwise each thread compacts its own slice in place and records the slice
 * start and kept count in its thread_state; the master thread then moves the
 * per-thread results so that they are contiguous from SA[0].
 */
static void libsais16_partial_sorting_gather_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; if (omp_num_threads == 1) { libsais16_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.position = omp_block_start; thread_state[omp_thread_num].state.count = libsais16_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size) - omp_block_start; } #pragma omp barrier #pragma omp master { fast_sint_t t, position = 0; for (t = 0; t < omp_num_threads; ++t) { if (t > 0 && thread_state[t].state.count > 0) { memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); } position += thread_state[t].state.count; } } } #endif } } static void libsais16_partial_sorting_gather_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : n - omp_block_start; if (omp_num_threads == 1) { libsais16_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.position = omp_block_start; thread_state[omp_thread_num].state.count = libsais16_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size) - omp_block_start; } #pragma omp barrier #pragma omp master { fast_sint_t t, position = 0; for (t = 0; t < omp_num_threads; ++t) { if (t > 0 && thread_state[t].state.count > 0) { memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); } position += thread_state[t].state.count; } } } #endif } } static void libsais16_induce_partial_order_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t flags, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { memset(&buckets[2 * ALPHABET_SIZE], 0, (size_t)2 * ALPHABET_SIZE * sizeof(sa_sint_t)); if (flags & LIBSAIS_FLAGS_GSA) { buckets[4 * ALPHABET_SIZE + BUCKETS_INDEX2(0, 1)] = buckets[4 * ALPHABET_SIZE + BUCKETS_INDEX2(1, 1)] - 1; libsais16_flip_suffix_markers_omp(SA, buckets[4 * ALPHABET_SIZE + BUCKETS_INDEX2(0, 1)], threads); } sa_sint_t d = libsais16_partial_sorting_scan_left_to_right_16u_omp(T, SA, n, k, buckets, left_suffixes_count, 0, threads, thread_state); libsais16_partial_sorting_shift_markers_16u_omp(SA, n, buckets, threads); if (flags & LIBSAIS_FLAGS_GSA) { libsais16_partial_gsa_scan_right_to_left_16u_omp(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state); if (T[first_lms_suffix] == 0) { memmove(&SA[1], &SA[0], (size_t)(buckets[BUCKETS_INDEX2(1, 1)] - 1) * sizeof(sa_sint_t)); SA[0] = first_lms_suffix | SAINT_MIN; } buckets[BUCKETS_INDEX2(0, 1)] = 0; } else { 
libsais16_partial_sorting_scan_right_to_left_16u_omp(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state); } } static void libsais16_induce_partial_order_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { sa_sint_t d = libsais16_partial_sorting_scan_left_to_right_32s_6k_omp(T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state); libsais16_partial_sorting_shift_markers_32s_6k_omp(SA, k, buckets, threads); libsais16_partial_sorting_shift_buckets_32s_6k(k, buckets); libsais16_partial_sorting_scan_right_to_left_32s_6k_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state); } static void libsais16_induce_partial_order_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); sa_sint_t d = libsais16_partial_sorting_scan_left_to_right_32s_4k_omp(T, SA, n, k, buckets, 0, threads, thread_state); libsais16_partial_sorting_shift_markers_32s_4k(SA, n); libsais16_partial_sorting_scan_right_to_left_32s_4k_omp(T, SA, n, k, buckets, d, threads, thread_state); libsais16_partial_sorting_gather_lms_suffixes_32s_4k_omp(SA, n, threads, thread_state); } static void libsais16_induce_partial_order_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { libsais16_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, &buckets[1 * (fast_sint_t)k], threads, thread_state); libsais16_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, &buckets[0 * (fast_sint_t)k], threads, thread_state); 
libsais16_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state); } static void libsais16_induce_partial_order_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { libsais16_count_suffixes_32s(T, n, k, buckets); libsais16_initialize_buckets_start_32s_1k(k, buckets); libsais16_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, buckets, threads, thread_state); libsais16_count_suffixes_32s(T, n, k, buckets); libsais16_initialize_buckets_end_32s_1k(k, buckets); libsais16_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, buckets, threads, thread_state); libsais16_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state); } static sa_sint_t libsais16_renumber_lms_suffixes_16u(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; sa_sint_t * RESTRICT SAm = &SA[m]; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) { libsais16_prefetchr(&SA[i + 2 * prefetch_distance]); libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]); libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]); libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]); libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]); sa_sint_t p0 = SA[i + 0]; SAm[(p0 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p0 < 0; sa_sint_t p1 = SA[i + 1]; SAm[(p1 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p1 < 0; sa_sint_t p2 = SA[i + 2]; SAm[(p2 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p2 < 0; sa_sint_t p3 = SA[i + 3]; SAm[(p3 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p3 < 0; } for (j += prefetch_distance + 3; i < j; i += 1) { sa_sint_t p = SA[i]; SAm[(p & SAINT_MAX) >> 1] = 
name | SAINT_MIN; name += p < 0; } return name; } static fast_sint_t libsais16_gather_marked_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; l -= 1; fast_sint_t i, j; for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j; i -= 4) { libsais16_prefetchr(&SA[i - prefetch_distance]); sa_sint_t s0 = SA[i - 0]; SA[l] = s0 & SAINT_MAX; l -= s0 < 0; sa_sint_t s1 = SA[i - 1]; SA[l] = s1 & SAINT_MAX; l -= s1 < 0; sa_sint_t s2 = SA[i - 2]; SA[l] = s2 & SAINT_MAX; l -= s2 < 0; sa_sint_t s3 = SA[i - 3]; SA[l] = s3 & SAINT_MAX; l -= s3 < 0; } for (j -= 3; i >= j; i -= 1) { sa_sint_t s = SA[i]; SA[l] = s & SAINT_MAX; l -= s < 0; } l += 1; return l; } static sa_sint_t libsais16_renumber_lms_suffixes_16u_omp(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { sa_sint_t name = 0; #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : m - omp_block_start; if (omp_num_threads == 1) { name = libsais16_renumber_lms_suffixes_16u(SA, m, 0, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.count = libsais16_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size); } #pragma omp barrier { fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } if (omp_thread_num == omp_num_threads - 1) { name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count); } libsais16_renumber_lms_suffixes_16u(SA, m, (sa_sint_t)count, omp_block_start, omp_block_size); } } #endif } return name; } static void libsais16_gather_marked_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start; if (omp_num_threads == 1) { libsais16_gather_marked_lms_suffixes(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { if (omp_thread_num < omp_num_threads - 1) { thread_state[omp_thread_num].state.position = libsais16_gather_marked_lms_suffixes(SA, m, (fast_sint_t)m + omp_block_start + omp_block_size, omp_block_start, omp_block_size); thread_state[omp_thread_num].state.count = (fast_sint_t)m + omp_block_start + omp_block_size - thread_state[omp_thread_num].state.position; } else { thread_state[omp_thread_num].state.position = libsais16_gather_marked_lms_suffixes(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size); thread_state[omp_thread_num].state.count = (fast_sint_t)n + (fast_sint_t)fs - thread_state[omp_thread_num].state.position; } } #pragma omp barrier #pragma omp master { fast_sint_t t, position = (fast_sint_t)n + (fast_sint_t)fs; for (t = omp_num_threads - 1; t >= 0; --t) { position -= thread_state[t].state.count; if (t != omp_num_threads - 1 && thread_state[t].state.count > 0) { memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); } } } } #endif } } static sa_sint_t libsais16_renumber_and_gather_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t)); sa_sint_t name = libsais16_renumber_lms_suffixes_16u_omp(SA, m, threads, thread_state); if (name < m) { libsais16_gather_marked_lms_suffixes_omp(SA, n, m, fs, threads, thread_state); } else { fast_sint_t i; for (i = 0; i < m; i += 1) { SA[i] &= SAINT_MAX; } } return name; } static sa_sint_t libsais16_renumber_distinct_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const 
fast_sint_t prefetch_distance = 64; sa_sint_t * RESTRICT SAm = &SA[m]; fast_sint_t i, j; sa_sint_t p0, p1, p2, p3 = 0; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) { libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]); libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]); libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]); libsais16_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]); p0 = SA[i + 0]; SAm[(SA[i + 0] = p0 & SAINT_MAX) >> 1] = name | (p0 & p3 & SAINT_MIN); name += p0 < 0; p1 = SA[i + 1]; SAm[(SA[i + 1] = p1 & SAINT_MAX) >> 1] = name | (p1 & p0 & SAINT_MIN); name += p1 < 0; p2 = SA[i + 2]; SAm[(SA[i + 2] = p2 & SAINT_MAX) >> 1] = name | (p2 & p1 & SAINT_MIN); name += p2 < 0; p3 = SA[i + 3]; SAm[(SA[i + 3] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN); name += p3 < 0; } for (j += prefetch_distance + 3; i < j; i += 1) { p2 = p3; p3 = SA[i]; SAm[(SA[i] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN); name += p3 < 0; } return name; } static void libsais16_mark_distinct_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; sa_sint_t p0, p1, p2, p3 = 0; for (i = (fast_sint_t)m + omp_block_start, j = (fast_sint_t)m + omp_block_start + omp_block_size - 3; i < j; i += 4) { libsais16_prefetchw(&SA[i + prefetch_distance]); p0 = SA[i + 0]; SA[i + 0] = p0 & (p3 | SAINT_MAX); p0 = (p0 == 0) ? p3 : p0; p1 = SA[i + 1]; SA[i + 1] = p1 & (p0 | SAINT_MAX); p1 = (p1 == 0) ? p0 : p1; p2 = SA[i + 2]; SA[i + 2] = p2 & (p1 | SAINT_MAX); p2 = (p2 == 0) ? p1 : p2; p3 = SA[i + 3]; SA[i + 3] = p3 & (p2 | SAINT_MAX); p3 = (p3 == 0) ? p2 : p3; } for (j += 3; i < j; i += 1) { p2 = p3; p3 = SA[i]; SA[i] = p3 & (p2 | SAINT_MAX); p3 = (p3 == 0) ? 
p2 : p3; } } static void libsais16_clamp_lms_suffixes_length_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; sa_sint_t * RESTRICT SAm = &SA[m]; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4) { libsais16_prefetchw(&SAm[i + prefetch_distance]); SAm[i + 0] = (SAm[i + 0] < 0 ? SAm[i + 0] : 0) & SAINT_MAX; SAm[i + 1] = (SAm[i + 1] < 0 ? SAm[i + 1] : 0) & SAINT_MAX; SAm[i + 2] = (SAm[i + 2] < 0 ? SAm[i + 2] : 0) & SAINT_MAX; SAm[i + 3] = (SAm[i + 3] < 0 ? SAm[i + 3] : 0) & SAINT_MAX; } for (j += 3; i < j; i += 1) { SAm[i] = (SAm[i] < 0 ? SAm[i] : 0) & SAINT_MAX; } } static sa_sint_t libsais16_renumber_distinct_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { sa_sint_t name = 0; #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : m - omp_block_start; if (omp_num_threads == 1) { name = libsais16_renumber_distinct_lms_suffixes_32s_4k(SA, m, 1, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.count = libsais16_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size); } #pragma omp barrier { fast_sint_t t, count = 1; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } if (omp_thread_num == omp_num_threads - 1) { name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count); } libsais16_renumber_distinct_lms_suffixes_32s_4k(SA, m, (sa_sint_t)count, omp_block_start, omp_block_size); } } #endif } return name - 1; } static void libsais16_mark_distinct_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start; #else UNUSED(threads); fast_sint_t omp_block_start = 0; fast_sint_t omp_block_size = (fast_sint_t)n >> 1; #endif libsais16_mark_distinct_lms_suffixes_32s(SA, m, omp_block_start, omp_block_size); } } static void libsais16_clamp_lms_suffixes_length_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start; #else UNUSED(threads); fast_sint_t omp_block_start = 0; fast_sint_t omp_block_size = (fast_sint_t)n >> 1; #endif libsais16_clamp_lms_suffixes_length_32s(SA, m, omp_block_start, omp_block_size); } } static sa_sint_t libsais16_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t)); sa_sint_t name = libsais16_renumber_distinct_lms_suffixes_32s_4k_omp(SA, m, threads, thread_state); if (name < m) { libsais16_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads); } return name; } static sa_sint_t libsais16_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) { const fast_sint_t prefetch_distance = 64; sa_sint_t * RESTRICT SAm = &SA[m]; { libsais16_gather_lms_suffixes_32s(T, SA, n); memset(&SA[m], 0, ((size_t)n - (size_t)m - (size_t)m) * sizeof(sa_sint_t)); fast_sint_t i, j; for (i = (fast_sint_t)n - (fast_sint_t)m, j = 
(fast_sint_t)n - 1 - prefetch_distance - 3; i < j; i += 4) { libsais16_prefetchr(&SA[i + 2 * prefetch_distance]); libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]); libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]); SAm[((sa_uint_t)SA[i + 0]) >> 1] = SA[i + 1] - SA[i + 0] + 1 + SAINT_MIN; SAm[((sa_uint_t)SA[i + 1]) >> 1] = SA[i + 2] - SA[i + 1] + 1 + SAINT_MIN; SAm[((sa_uint_t)SA[i + 2]) >> 1] = SA[i + 3] - SA[i + 2] + 1 + SAINT_MIN; SAm[((sa_uint_t)SA[i + 3]) >> 1] = SA[i + 4] - SA[i + 3] + 1 + SAINT_MIN; } for (j += prefetch_distance + 3; i < j; i += 1) { SAm[((sa_uint_t)SA[i]) >> 1] = SA[i + 1] - SA[i] + 1 + SAINT_MIN; } SAm[((sa_uint_t)SA[n - 1]) >> 1] = 1 + SAINT_MIN; } { libsais16_clamp_lms_suffixes_length_32s_omp(SA, n, m, threads); } sa_sint_t name = 1; { fast_sint_t i, j, p = SA[0], plen = SAm[p >> 1]; sa_sint_t pdiff = SAINT_MIN; for (i = 1, j = m - prefetch_distance - 1; i < j; i += 2) { libsais16_prefetchr(&SA[i + 2 * prefetch_distance]); libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); libsais16_prefetchr(&T[((sa_uint_t)SA[i + prefetch_distance + 0])]); libsais16_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); libsais16_prefetchr(&T[((sa_uint_t)SA[i + prefetch_distance + 1])]); fast_sint_t q = SA[i + 0], qlen = SAm[q >> 1]; sa_sint_t qdiff = SAINT_MIN; if (plen == qlen) { fast_sint_t l = 0; do { if (T[p + l] != T[q + l]) { break; } } while (++l < qlen); qdiff = (sa_sint_t)(l - qlen) & SAINT_MIN; } SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0); p = SA[i + 1]; plen = SAm[p >> 1]; pdiff = SAINT_MIN; if (qlen == plen) { fast_sint_t l = 0; do { if (T[q + l] != T[p + l]) { break; } } while (++l < plen); pdiff = (sa_sint_t)(l - plen) & SAINT_MIN; } SAm[q >> 1] = name | (qdiff & pdiff); name += (pdiff < 
0); } for (j += prefetch_distance + 1; i < j; i += 1) { fast_sint_t q = SA[i], qlen = SAm[q >> 1]; sa_sint_t qdiff = SAINT_MIN; if (plen == qlen) { fast_sint_t l = 0; do { if (T[p + l] != T[q + l]) { break; } } while (++l < plen); qdiff = (sa_sint_t)(l - plen) & SAINT_MIN; } SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0); p = q; plen = qlen; pdiff = qdiff; } SAm[p >> 1] = name | pdiff; name++; } if (name <= m) { libsais16_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads); } return name - 1; } static void libsais16_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; const sa_sint_t * RESTRICT SAnm = &SA[n - m]; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) { libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); libsais16_prefetchr(&SAnm[SA[i + prefetch_distance + 0]]); libsais16_prefetchr(&SAnm[SA[i + prefetch_distance + 1]]); libsais16_prefetchr(&SAnm[SA[i + prefetch_distance + 2]]); libsais16_prefetchr(&SAnm[SA[i + prefetch_distance + 3]]); SA[i + 0] = SAnm[SA[i + 0]]; SA[i + 1] = SAnm[SA[i + 1]]; SA[i + 2] = SAnm[SA[i + 2]]; SA[i + 3] = SAnm[SA[i + 3]]; } for (j += prefetch_distance + 3; i < j; i += 1) { SA[i] = SAnm[SA[i]]; } } static void libsais16_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : m - omp_block_start; #else UNUSED(threads); fast_sint_t omp_block_start = 0; fast_sint_t omp_block_size = m; #endif libsais16_reconstruct_lms_suffixes(SA, n, m, omp_block_start, omp_block_size); } } static void libsais16_place_lms_suffixes_interval_16u(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t flags, sa_sint_t * RESTRICT buckets) { if (flags & LIBSAIS_FLAGS_GSA) { buckets[7 * ALPHABET_SIZE]--; } { const sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE]; fast_sint_t c, j = n; for (c = ALPHABET_SIZE - 2; c >= 0; --c) { fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] - (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)]; if (l > 0) { fast_sint_t i = bucket_end[c]; if (j - i > 0) { memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); } memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); } } memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); } if (flags & LIBSAIS_FLAGS_GSA) { buckets[7 * ALPHABET_SIZE]++; } } static void libsais16_place_lms_suffixes_interval_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) { const sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k]; fast_sint_t c, j = n; for (c = (fast_sint_t)k - 2; c >= 0; --c) { fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] - (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)]; if (l > 0) { fast_sint_t i = bucket_end[c]; if (j - i > 0) { memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); } memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); } } memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); } static void libsais16_place_lms_suffixes_interval_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) { fast_sint_t j = n; if (k > 1) { fast_sint_t c; for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= 
BUCKETS_INDEX2(1, 0)) { fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(1, 1)] - (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)]; if (l > 0) { fast_sint_t i = buckets[c]; if (j - i > 0) { memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); } memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); } } } memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); } static void libsais16_place_lms_suffixes_interval_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t m, sa_sint_t * RESTRICT buckets) { const fast_sint_t prefetch_distance = 64; sa_sint_t c = k - 1; fast_sint_t i, l = buckets[c]; for (i = (fast_sint_t)m - 1; i >= prefetch_distance + 3; i -= 4) { libsais16_prefetchr(&SA[i - 2 * prefetch_distance]); libsais16_prefetchr(&T[SA[i - prefetch_distance - 0]]); libsais16_prefetchr(&T[SA[i - prefetch_distance - 1]]); libsais16_prefetchr(&T[SA[i - prefetch_distance - 2]]); libsais16_prefetchr(&T[SA[i - prefetch_distance - 3]]); sa_sint_t p0 = SA[i - 0]; if (T[p0] != c) { c = T[p0]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p0; sa_sint_t p1 = SA[i - 1]; if (T[p1] != c) { c = T[p1]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p1; sa_sint_t p2 = SA[i - 2]; if (T[p2] != c) { c = T[p2]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p2; sa_sint_t p3 = SA[i - 3]; if (T[p3] != c) { c = T[p3]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p3; } for (; i >= 0; i -= 1) { sa_sint_t p = SA[i]; if (T[p] != c) { c = T[p]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p; } memset(&SA[0], 0, (size_t)l * sizeof(sa_sint_t)); } static void libsais16_place_lms_suffixes_histogram_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t 
m, const sa_sint_t * RESTRICT buckets) { const sa_sint_t * RESTRICT bucket_end = &buckets[5 * (fast_sint_t)k]; fast_sint_t c, j = n; for (c = (fast_sint_t)k - 2; c >= 0; --c) { fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 1)]; if (l > 0) { fast_sint_t i = bucket_end[c]; if (j - i > 0) { memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); } memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); } } memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); } static void libsais16_place_lms_suffixes_histogram_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) { const sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k]; fast_sint_t c, j = n; for (c = (fast_sint_t)k - 2; c >= 0; --c) { fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)]; if (l > 0) { fast_sint_t i = bucket_end[c]; if (j - i > 0) { memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); } memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); } } memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); } static void libsais16_place_lms_suffixes_histogram_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) { fast_sint_t j = n; if (k > 1) { fast_sint_t c; for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0)) { fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)]; if (l > 0) { fast_sint_t i = buckets[c]; if (j - i > 0) { memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); } memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); } } } memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); } static void libsais16_final_bwt_scan_left_to_right_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t 
i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) { libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetchr(s0 > 0 ? Ts0 : NULL); sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetchr(s1 > 0 ? Ts1 : NULL); sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; SA[induction_bucket[T[p0]]++] = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; SA[induction_bucket[T[p1]]++] = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } } for (j += prefetch_distance + 1; i < j; i += 1) { sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } } } static void libsais16_final_bwt_aux_scan_left_to_right_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) { libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetchr(s0 > 0 ? Ts0 : NULL); sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetchr(s1 > 0 ? 
Ts1 : NULL); sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; SA[induction_bucket[T[p0]]++] = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); if ((p0 & rm) == 0) { I[p0 / (rm + 1)] = induction_bucket[T[p0]]; }} sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; SA[induction_bucket[T[p1]]++] = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); if ((p1 & rm) == 0) { I[p1 / (rm + 1)] = induction_bucket[T[p1]]; }} } for (j += prefetch_distance + 1; i < j; i += 1) { sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]]; } } } } static void libsais16_final_sorting_scan_left_to_right_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) { libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetchr(s0 > 0 ? Ts0 : NULL); sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetchr(s1 > 0 ? 
Ts1 : NULL); sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } } for (j += prefetch_distance + 1; i < j; i += 1) { sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } } }
/*
 * Left-to-right induction pass over SA[omp_block_start, omp_block_start + omp_block_size)
 * for text stored as sa_sint_t symbols.
 *
 * Every visited slot has its top bit toggled (p ^ SAINT_MIN). For p = SA[i] > 0,
 * with q = p - 1, q is appended at SA[induction_bucket[T[q]]++], with the top bit
 * set when T[q - (q > 0)] < T[q].
 *
 * Prefetching is staged: entries 2 * prefetch_distance ahead have their text
 * prefetched, and entries 1 * prefetch_distance ahead have the induction_bucket
 * slot of T[s - 1] prefetched. The second loop finishes the entries the unrolled
 * loop does not reach.
 */
static void libsais16_final_sorting_scan_left_to_right_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) { libsais16_prefetchw(&SA[i + 3 * prefetch_distance]); sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 1]; libsais16_prefetchr(Ts0 - 1); sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 : 1]; libsais16_prefetchr(Ts1 - 1); sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais16_prefetchw(&induction_bucket[T[s2 - 1]]); libsais16_prefetchr(&T[s2] - 2); } sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais16_prefetchw(&induction_bucket[T[s3 - 1]]); libsais16_prefetchr(&T[s3] - 2); } sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } } for (j += 2 * prefetch_distance + 1; i < j; i += 1) { sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } } } #if defined(LIBSAIS_OPENMP)
/*
 * Per-thread first phase of the parallel left-to-right BWT scan.
 *
 * Clears buckets[0 .. k), then walks SA[omp_block_start, omp_block_start + omp_block_size).
 * Every visited slot is rewritten with its top bit cleared. For p = SA[i] > 0, with
 * q = p - 1:
 *   - SA[i] becomes T[q] | SAINT_MIN;
 *   - T[q] is counted in buckets;
 *   - { symbol = T[q], index = q with the top bit set when T[q - (q > 0)] < T[q] }
 *     is appended to cache in scan order.
 * No destination bucket position in SA is written here.
 *
 * Returns the number of cache entries written.
 */
static fast_sint_t libsais16_final_bwt_scan_left_to_right_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; memset(buckets, 0, (size_t)k * sizeof(sa_sint_t)); fast_sint_t i, j, count = 0; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) { libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetchr(s0 > 0 ? Ts0 : NULL); sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetchr(s1 > 0 ?
Ts1 : NULL); sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } } for (j += prefetch_distance + 1; i < j; i += 1) { sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } } return count; }
/*
 * Per-thread first phase of the parallel left-to-right sorting scan (16-bit input).
 *
 * Clears buckets[0 .. k), then walks SA[omp_block_start, omp_block_start + omp_block_size).
 * Every visited slot has its top bit toggled (p ^ SAINT_MIN). For p = SA[i] > 0, with
 * q = p - 1, T[q] is counted in buckets and
 * { symbol = T[q], index = q with the top bit set when T[q - (q > 0)] < T[q] }
 * is appended to cache in scan order. No destination bucket position in SA is
 * written here.
 *
 * Returns the number of cache entries written.
 */
static fast_sint_t libsais16_final_sorting_scan_left_to_right_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; memset(buckets, 0, (size_t)k * sizeof(sa_sint_t)); fast_sint_t i, j, count = 0; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) { libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetchr(s0 > 0 ? Ts0 : NULL); sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetchr(s1 > 0 ?
Ts1 : NULL); sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } } for (j += prefetch_distance + 1; i < j; i += 1) { sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } } return count; }
/*
 * Writes `count` cached entries into SA in cache order:
 * SA[buckets[symbol]++] = index for each entry.
 * The main loop places four entries per iteration and prefetches the cache
 * prefetch_distance entries ahead; the second loop places the remainder.
 */
static void libsais16_final_order_scan_left_to_right_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = 0, j = count - 3; i < j; i += 4) { libsais16_prefetchr(&cache[i + prefetch_distance]); SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index; SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index; SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index; SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index; } for (j += 3; i < j; i += 1) { SA[buckets[cache[i].symbol]++] = cache[i].index; } }
/*
 * Same placement as above (SA[buckets[symbol]++] = index, in cache order), plus
 * auxiliary-index recording: when (index & rm) == 0, the already-incremented
 * bucket position is stored in I[(index & SAINT_MAX) / (rm + 1)].
 * Four entries per iteration in the main loop, then a one-at-a-time tail.
 */
static void libsais16_final_bwt_aux_scan_left_to_right_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = 0, j = count - 3; i < j; i += 4) { libsais16_prefetchr(&cache[i + prefetch_distance]); SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index; if ((cache[i + 0].index & rm) == 0) { I[(cache[i + 0].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 0].symbol]; } SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index; if ((cache[i +
1].index & rm) == 0) { I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 1].symbol]; } SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index; if ((cache[i + 2].index & rm) == 0) { I[(cache[i + 2].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 2].symbol]; } SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index; if ((cache[i + 3].index & rm) == 0) { I[(cache[i + 3].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 3].symbol]; } } for (j += 3; i < j; i += 1) { SA[buckets[cache[i].symbol]++] = cache[i].index; if ((cache[i].index & rm) == 0) { I[(cache[i].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol]; } } }
/*
 * Gather phase of the parallel left-to-right sorting scan for sa_sint_t text.
 *
 * cache is indexed with the same i as SA (one cache slot per SA position in
 * [omp_block_start, omp_block_start + omp_block_size)). Every SA[i] has its top
 * bit toggled (p ^ SAINT_MIN). For p = SA[i] > 0, with q = p - 1, the slot gets
 * index = q (top bit set when T[q - (q > 0)] < T[q]) and symbol = T[q]; otherwise
 * only symbol = SAINT_MIN is stored and index is left untouched.
 */
static void libsais16_final_sorting_scan_left_to_right_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) { libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais16_prefetchr(Ts0 - 1); libsais16_prefetchr(Ts0 - 2); sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 : 2]; libsais16_prefetchr(Ts1 - 1); libsais16_prefetchr(Ts1 - 2); libsais16_prefetchw(&cache[i + prefetch_distance]); sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; cache[i + 0].index = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); symbol0 = T[p0]; } cache[i + 0].symbol = symbol0; sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; cache[i + 1].index = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); symbol1 = T[p1]; } cache[i + 1].symbol = symbol1; } for (j += prefetch_distance + 1; i < j; i += 1) { sa_sint_t symbol = SAINT_MIN, p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; cache[i].index = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); symbol = T[p]; } cache[i].symbol = symbol; } }
/*
 * Resolves destinations for gathered cache entries, visiting i in increasing order
 * over [omp_block_start, omp_block_start + omp_block_size).
 *
 * For an entry whose symbol v is >= 0, symbol is replaced by induction_bucket[v]++
 * (the destination position). If that destination is below omp_block_end, the
 * entry's index has its top bit toggled and, when the original index np was > 0,
 * cache[destination] is filled in for np - 1: index = np - 1 with the top bit set
 * when T[np - 1 - (np - 1 > 0)] < T[np - 1], and symbol = T[np - 1].
 * Entries with a negative symbol are skipped.
 */
static void libsais16_final_sorting_scan_left_to_right_32s_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size; for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) { libsais16_prefetchw(&cache[i + 2 * prefetch_distance]); sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 > 0 ? s0 : 0]; libsais16_prefetchw(Is0); sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 > 0 ?
s1 : 0]; libsais16_prefetchw(Is1); sa_sint_t v0 = cache[i + 0].symbol; if (v0 >= 0) { cache[i + 0].symbol = induction_bucket[v0]++; if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; cache[i + 0].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | (sa_sint_t)((sa_uint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } } sa_sint_t v1 = cache[i + 1].symbol; if (v1 >= 0) { cache[i + 1].symbol = induction_bucket[v1]++; if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; cache[i + 1].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | (sa_sint_t)((sa_uint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } } } for (j += prefetch_distance + 1; i < j; i += 1) { sa_sint_t v = cache[i].symbol; if (v >= 0) { cache[i].symbol = induction_bucket[v]++; if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; cache[i].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | (sa_sint_t)((sa_uint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } } } }
/*
 * Runs the left-to-right BWT scan over SA[block_start, block_start + block_size),
 * inside an OpenMP parallel region when OpenMP is enabled.
 *
 * The block is split into per-thread ranges whose stride is block_size / threads
 * rounded down to a multiple of 16; the last thread takes whatever remains.
 * With a single thread the serial scan is called directly. Otherwise:
 *   1. each thread runs the _block_prepare helper on its range, filling its own
 *      buckets and cache and recording the entry count;
 *   2. after a barrier, the master thread walks the threads in order and, for each
 *      symbol c < k, replaces the thread's count with the current
 *      induction_bucket[c] while advancing induction_bucket[c] by that count;
 *   3. after a second barrier, each thread places its cached entries into SA.
 */
static void libsais16_final_bwt_scan_left_to_right_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ?
k : 256) && omp_get_dynamic() == 0) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(k); UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { libsais16_final_bwt_scan_left_to_right_16u(T, SA, induction_bucket, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.count = libsais16_final_bwt_scan_left_to_right_16u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); } #pragma omp barrier #pragma omp master { fast_sint_t t; for (t = 0; t < omp_num_threads; ++t) { sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; } } } #pragma omp barrier { libsais16_final_order_scan_left_to_right_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); } } #endif } }
/*
 * Runs the left-to-right BWT-with-auxiliary-indexes scan over
 * SA[block_start, block_start + block_size), inside an OpenMP parallel region
 * when OpenMP is enabled.
 *
 * Per-thread ranges use a stride of block_size / threads rounded down to a
 * multiple of 16; the last thread takes the remainder. With a single thread the
 * serial aux scan is called directly. Otherwise each thread runs the BWT
 * _block_prepare helper, the master thread turns the per-thread symbol counts
 * into per-thread start positions (advancing induction_bucket), and each thread
 * then calls the aux _block_place helper, which also fills I. Barriers separate
 * the three phases.
 */
static void libsais16_final_bwt_aux_scan_left_to_right_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ?
k : 256) && omp_get_dynamic() == 0) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(k); UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { libsais16_final_bwt_aux_scan_left_to_right_16u(T, SA, rm, I, induction_bucket, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.count = libsais16_final_bwt_scan_left_to_right_16u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); } #pragma omp barrier #pragma omp master { fast_sint_t t; for (t = 0; t < omp_num_threads; ++t) { sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; } } } #pragma omp barrier { libsais16_final_bwt_aux_scan_left_to_right_16u_block_place(SA, rm, I, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); } } #endif } }
/*
 * Runs the left-to-right sorting scan (16-bit input) over
 * SA[block_start, block_start + block_size), inside an OpenMP parallel region
 * when OpenMP is enabled.
 *
 * Per-thread ranges use a stride of block_size / threads rounded down to a
 * multiple of 16; the last thread takes the remainder. With a single thread the
 * serial sorting scan is called directly. Otherwise each thread runs the sorting
 * _block_prepare helper, the master thread turns the per-thread symbol counts
 * into per-thread start positions (advancing induction_bucket), and each thread
 * then places its cached entries. Barriers separate the three phases.
 */
static void libsais16_final_sorting_scan_left_to_right_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ?
k : 256) && omp_get_dynamic() == 0) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(k); UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { libsais16_final_sorting_scan_left_to_right_16u(T, SA, induction_bucket, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.count = libsais16_final_sorting_scan_left_to_right_16u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); } #pragma omp barrier #pragma omp master { fast_sint_t t; for (t = 0; t < omp_num_threads; ++t) { sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; } } } #pragma omp barrier { libsais16_final_order_scan_left_to_right_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); } } #endif } }
/*
 * Runs the left-to-right sorting scan for sa_sint_t text over
 * SA[block_start, block_start + block_size), inside an OpenMP parallel region
 * (taken only when threads > 1 and block_size >= 16384) when OpenMP is enabled.
 *
 * With a single thread the serial 32s scan is called directly. Otherwise the
 * helpers receive `cache - block_start`, so they index the cache with absolute
 * SA positions:
 *   1. each thread gathers its range into the cache;
 *   2. after a barrier, the master thread runs _block_sort over the whole block;
 *   3. after a second barrier, each thread calls
 *      libsais16_compact_and_place_cached_suffixes on its range (defined
 *      elsewhere in this file).
 */
static void libsais16_final_sorting_scan_left_to_right_32s_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num =
omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); UNUSED(cache); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { libsais16_final_sorting_scan_left_to_right_32s(T, SA, buckets, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { libsais16_final_sorting_scan_left_to_right_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); } #pragma omp barrier #pragma omp master { libsais16_final_sorting_scan_left_to_right_32s_block_sort(T, buckets, cache - block_start, block_start, block_size); } #pragma omp barrier { libsais16_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); } } #endif } } #endif
/*
 * Driver for the left-to-right BWT scan over SA[0, n).
 *
 * First appends position n - 1 to the bucket of T[n - 1], with the top bit set
 * when T[n - 2] < T[n - 1]. If threads == 1 or n < 65536 the serial scan then
 * processes the whole array. Otherwise (OpenMP builds only) SA is walked from the
 * left: zero entries are skipped, and each run of non-zero entries, capped at
 * threads * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads) entries, is either
 * processed inline (runs shorter than 32) or handed to the _block_omp helper.
 */
static void libsais16_final_bwt_scan_left_to_right_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | (sa_sint_t)((sa_uint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1)); if (threads == 1 || n < 65536) { libsais16_final_bwt_scan_left_to_right_16u(T, SA, induction_bucket, 0, n); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t block_start; for (block_start = 0; block_start < n; ) { if (SA[block_start] == 0) { block_start++; } else { fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;} fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] !=
0) { block_end++; } fast_sint_t block_size = block_end - block_start; if (block_size < 32) { for (; block_start < block_end; block_start += 1) { sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[block_start] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } } } else { libsais16_final_bwt_scan_left_to_right_16u_block_omp(T, SA, k, induction_bucket, block_start, block_size, threads, thread_state); block_start = block_end; } } } } #else UNUSED(k); UNUSED(thread_state); #endif }
/*
 * Driver for the left-to-right BWT-with-auxiliary-indexes scan over SA[0, n).
 *
 * First appends position n - 1 to the bucket of T[n - 1] (top bit set when
 * T[n - 2] < T[n - 1]) and, if ((n - 1) & rm) == 0, stores the already-incremented
 * bucket position in I[(n - 1) / (rm + 1)]. If threads == 1 or n < 65536 the
 * serial aux scan then processes the whole array. Otherwise (OpenMP builds only)
 * SA is walked from the left: zero entries are skipped, and each run of non-zero
 * entries, capped at threads * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads)
 * entries, is either processed inline (runs shorter than 32) or handed to the
 * aux _block_omp helper.
 */
static void libsais16_final_bwt_aux_scan_left_to_right_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t k, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | (sa_sint_t)((sa_uint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1)); if ((((sa_sint_t)n - 1) & rm) == 0) { I[((sa_sint_t)n - 1) / (rm + 1)] = induction_bucket[T[(sa_sint_t)n - 1]]; } if (threads == 1 || n < 65536) { libsais16_final_bwt_aux_scan_left_to_right_16u(T, SA, rm, I, induction_bucket, 0, n); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t block_start; for (block_start = 0; block_start < n; ) { if (SA[block_start] == 0) { block_start++; } else { fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;} fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; } fast_sint_t block_size = block_end - block_start; if (block_size < 32) { for (; block_start < block_end; block_start += 1) { sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[block_start] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p |
(sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]]; } } } } else { libsais16_final_bwt_aux_scan_left_to_right_16u_block_omp(T, SA, k, rm, I, induction_bucket, block_start, block_size, threads, thread_state); block_start = block_end; } } } } #else UNUSED(k); UNUSED(thread_state); #endif }
/*
 * Driver for the left-to-right sorting scan (16-bit input) over SA[0, n).
 *
 * First appends position n - 1 to the bucket of T[n - 1], with the top bit set
 * when T[n - 2] < T[n - 1]. If threads == 1 or n < 65536 the serial scan then
 * processes the whole array. Otherwise (OpenMP builds only) SA is walked from the
 * left: zero entries are skipped, and each run of non-zero entries, capped at
 * threads * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads) entries, is either
 * processed inline (runs shorter than 32) or handed to the _block_omp helper.
 */
static void libsais16_final_sorting_scan_left_to_right_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | (sa_sint_t)((sa_uint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1)); if (threads == 1 || n < 65536) { libsais16_final_sorting_scan_left_to_right_16u(T, SA, induction_bucket, 0, n); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t block_start; for (block_start = 0; block_start < n; ) { if (SA[block_start] == 0) { block_start++; } else { fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;} fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; } fast_sint_t block_size = block_end - block_start; if (block_size < 32) { for (; block_start < block_end; block_start += 1) { sa_sint_t p = SA[block_start]; SA[block_start] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } } } else { libsais16_final_sorting_scan_left_to_right_16u_block_omp(T, SA, k, induction_bucket, block_start, block_size, threads, thread_state); block_start = block_end; } } } } #else UNUSED(k); UNUSED(thread_state); #endif }
/*
 * Driver for the left-to-right sorting scan for sa_sint_t text over SA[0, n).
 *
 * First appends position n - 1 to the bucket of T[n - 1], with the top bit set
 * when T[n - 2] < T[n - 1]. If threads == 1 or n < 65536 the serial 32s scan then
 * processes the whole array. Otherwise (OpenMP builds only) SA is cut into
 * consecutive blocks of threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries, each
 * passed to the 32s _block_omp helper with thread_state[0].state.cache as the
 * cache.
 */
static void libsais16_final_sorting_scan_left_to_right_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n,
sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { SA[induction_bucket[T[n - 1]]++] = (n - 1) | (sa_sint_t)((sa_uint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)); if (threads == 1 || n < 65536) { libsais16_final_sorting_scan_left_to_right_32s(T, SA, induction_bucket, 0, n); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t block_start, block_end; for (block_start = 0; block_start < n; block_start = block_end) { block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; } libsais16_final_sorting_scan_left_to_right_32s_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, block_start, block_end - block_start, threads); } } #else UNUSED(thread_state); #endif }
/*
 * Right-to-left induction pass for BWT output over
 * SA[omp_block_start, omp_block_start + omp_block_size), visiting the highest
 * index first.
 *
 * Every visited slot is first rewritten with its top bit cleared. For p = SA[i] > 0,
 * with q = p - 1, c1 = T[q] and c0 = T[q - (q > 0)]:
 *   - SA[i] becomes c1;
 *   - SA[--induction_bucket[c1]] receives q when c0 <= c1, otherwise c0 | SAINT_MIN.
 *
 * Returns the index of the last visited slot that held 0 (the lowest such index
 * in the block), or -1 if no slot held 0.
 */
static sa_sint_t libsais16_final_bwt_scan_right_to_left_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; sa_sint_t index = -1; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) { libsais16_prefetchw(&SA[i - 2 * prefetch_distance]); sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetchr(s0 > 0 ? Ts0 : NULL); sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetchr(s1 > 0 ? Ts1 : NULL); sa_sint_t p0 = SA[i - 0]; index = (p0 == 0) ? (sa_sint_t)(i - 0) : index; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint16_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; } sa_sint_t p1 = SA[i - 1]; index = (p1 == 0) ?
(sa_sint_t)(i - 1) : index; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint16_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t; } } for (j -= prefetch_distance + 1; i >= j; i -= 1) { sa_sint_t p = SA[i]; index = (p == 0) ? (sa_sint_t)i : index; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; } } return index; }
/*
 * Right-to-left induction pass for BWT output with auxiliary indexes over
 * SA[omp_block_start, omp_block_start + omp_block_size), visiting the highest
 * index first.
 *
 * Every visited slot is first rewritten with its top bit cleared. For p = SA[i] > 0,
 * with q = p - 1, c1 = T[q] and c0 = T[q - (q > 0)]:
 *   - SA[i] becomes c1;
 *   - SA[--induction_bucket[c1]] receives q when c0 <= c1, otherwise c0 | SAINT_MIN;
 *   - if (q & rm) == 0, I[q / (rm + 1)] is set to the already-decremented
 *     induction_bucket[T[q]] plus 1.
 */
static void libsais16_final_bwt_aux_scan_right_to_left_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) { libsais16_prefetchw(&SA[i - 2 * prefetch_distance]); sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetchr(s0 > 0 ? Ts0 : NULL); sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetchr(s1 > 0 ? Ts1 : NULL); sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint16_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; if ((p0 & rm) == 0) { I[p0 / (rm + 1)] = induction_bucket[T[p0]] + 1; } } sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint16_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ?
p1 : t; if ((p1 & rm) == 0) { I[p1 / (rm + 1)] = induction_bucket[T[p1]] + 1; } } } for (j -= prefetch_distance + 1; i >= j; i -= 1) { sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]] + 1; } } } }
/*
 * Right-to-left induction pass for plain suffix-array output (16-bit input) over
 * SA[omp_block_start, omp_block_start + omp_block_size), visiting the highest
 * index first.
 *
 * Every visited slot is rewritten with its top bit cleared. For p = SA[i] > 0,
 * with q = p - 1, q is stored at SA[--induction_bucket[T[q]]], with the top bit
 * set when T[q - (q > 0)] > T[q].
 *
 * The first loop handles two entries per iteration and prefetches SA and T
 * prefetch_distance entries ahead; the second loop finishes the remaining entries.
 */
static void libsais16_final_sorting_scan_right_to_left_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) { libsais16_prefetchw(&SA[i - 2 * prefetch_distance]); sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetchr(s0 > 0 ? Ts0 : NULL); sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetchr(s1 > 0 ?
Ts1 : NULL); sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); } } for (j -= prefetch_distance + 1; i >= j; i -= 1) { sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } } }
/*
 * Right-to-left induction pass for the GSA variant (16-bit input) over
 * SA[omp_block_start, omp_block_start + omp_block_size), visiting the highest
 * index first.
 *
 * Every visited slot is rewritten with its top bit cleared. A predecessor is
 * induced only when p = SA[i] > 0 and T[p - 1] > 0 (a zero symbol before the
 * suffix stops induction). In that case, with q = p - 1, q is stored at
 * SA[--induction_bucket[T[q]]], with the top bit set when T[q - (q > 0)] > T[q].
 */
static void libsais16_final_gsa_scan_right_to_left_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) { libsais16_prefetchw(&SA[i - 2 * prefetch_distance]); sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetchr(s0 > 0 ? Ts0 : NULL); sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetchr(s1 > 0 ?
Ts1 : NULL); sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0 && T[p0 - 1] > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0 && T[p1 - 1] > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); } } for (j -= prefetch_distance + 1; i >= j; i -= 1) { sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0 && T[p - 1] > 0) { p--; SA[--induction_bucket[T[p]]] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } } }
/*
 * Right-to-left induction pass for sa_sint_t text over
 * SA[omp_block_start, omp_block_start + omp_block_size), visiting the highest
 * index first.
 *
 * Every visited slot is rewritten with its top bit cleared. For p = SA[i] > 0,
 * with q = p - 1, q is stored at SA[--induction_bucket[T[q]]], with the top bit
 * set when T[q - (q > 0)] > T[q].
 *
 * Prefetching is staged: entries 2 * prefetch_distance ahead have their text
 * prefetched, and entries 1 * prefetch_distance ahead have the induction_bucket
 * slot of T[s - 1] prefetched. The second loop finishes the entries the unrolled
 * loop does not reach.
 */
static void libsais16_final_sorting_scan_right_to_left_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) { libsais16_prefetchw(&SA[i - 3 * prefetch_distance]); sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 1]; libsais16_prefetchr(Ts0 - 1); sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 : 1]; libsais16_prefetchr(Ts1 - 1); sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais16_prefetchw(&induction_bucket[T[s2 - 1]]); libsais16_prefetchr(&T[s2] - 2); } sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais16_prefetchw(&induction_bucket[T[s3 - 1]]); libsais16_prefetchr(&T[s3] - 2); } sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); } } for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) { sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } } } #if defined(LIBSAIS_OPENMP)
/*
 * Per-thread first phase of the parallel right-to-left BWT scan.
 *
 * Clears buckets[0 .. k), then walks SA[omp_block_start, omp_block_start + omp_block_size)
 * from the highest index down. Every visited slot is rewritten with its top bit
 * cleared. For p = SA[i] > 0, with q = p - 1, c1 = T[q] and c0 = T[q - (q > 0)]:
 *   - SA[i] becomes c1;
 *   - c1 is counted in buckets;
 *   - { symbol = c1, index = (c0 <= c1) ? q : c0 | SAINT_MIN } is appended to
 *     cache in scan order.
 * No destination bucket position in SA is written here.
 *
 * Returns the number of cache entries written.
 */
static fast_sint_t libsais16_final_bwt_scan_right_to_left_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; memset(buckets, 0, (size_t)k * sizeof(sa_sint_t)); fast_sint_t i, j, count = 0; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) { libsais16_prefetchw(&SA[i - 2 * prefetch_distance]); sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetchr(s0 > 0 ? Ts0 : NULL); sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetchr(s1 > 0 ?
/*
 * Per-thread "prepare" step of the blocked right-to-left BWT scan with auxiliary indexes.
 *
 * Zeroes the k counters in `buckets`, then walks SA[omp_block_start .. omp_block_start +
 * omp_block_size - 1] from the highest index down. Every entry p is first written back as
 * p & SAINT_MAX. For p > 0, with q = p - 1, c1 = T[q] and c0 = T[q - 1] (T[0] when q == 0):
 *   - the SA slot is overwritten with the symbol c1;
 *   - buckets[c1] is incremented;
 *   - two consecutive cache slots are filled: slot `count` gets symbol c1 and index q when
 *     c0 <= c1, otherwise c0 | SAINT_MIN; slot `count + 1` gets the plain position q.
 *
 * Returns the number of cache slots written (two per processed entry). Nothing is placed
 * into SA buckets here; the cache is consumed by a separate "place" step.
 */
static fast_sint_t libsais16_final_bwt_aux_scan_right_to_left_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    /* Per-thread symbol counters start from zero for every block. */
    memset(buckets, 0, (size_t)k * sizeof(sa_sint_t));

    fast_sint_t i, j, count = 0;
    /* Main loop: two entries per iteration, stopping early enough that the look-ahead reads stay at or above omp_block_start. */
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais16_prefetchw(&SA[i - 2 * prefetch_distance]);

        /* Prefetch the two text symbols preceding each upcoming position; NULL is passed when the entry is not positive. */
        sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16_prefetchr(s0 > 0 ? Ts0 : NULL);
        sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16_prefetchr(s1 > 0 ? Ts1 : NULL);

        /* The assignment inside the subscript records c1 as the cache symbol and counts it in one expression. */
        sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint16_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p0 : t; cache[count + 1].index = p0; count += 2; }
        sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint16_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p1 : t; cache[count + 1].index = p1; count += 2; }
    }

    /* Tail loop: j is restored to omp_block_start and the remaining entries are handled one at a time. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p : t; cache[count + 1].index = p; count += 2; }
    }

    return count;
}
Ts1 : NULL); sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); } } for (j -= prefetch_distance + 1; i >= j; i -= 1) { sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } } return count; } static void libsais16_final_order_scan_right_to_left_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = 0, j = count - 3; i < j; i += 4) { libsais16_prefetchr(&cache[i + prefetch_distance]); SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index; SA[--buckets[cache[i + 1].symbol]] = cache[i + 1].index; SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index; SA[--buckets[cache[i + 3].symbol]] = cache[i + 3].index; } for (j += 3; i < j; i += 1) { SA[--buckets[cache[i].symbol]] = cache[i].index; } } static void libsais16_final_gsa_scan_right_to_left_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = 0, j = count - 3; i < j; i += 4) { libsais16_prefetchr(&cache[i + prefetch_distance]); if (cache[i + 0].symbol > 0) { SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index; } if (cache[i + 1].symbol > 0) { SA[--buckets[cache[i + 1].symbol]] = cache[i + 1].index; } if (cache[i + 2].symbol > 0) { SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index; } if (cache[i + 3].symbol > 
0) { SA[--buckets[cache[i + 3].symbol]] = cache[i + 3].index; } } for (j += 3; i < j; i += 1) { if (cache[i].symbol > 0) { SA[--buckets[cache[i].symbol]] = cache[i].index; } } } static void libsais16_final_bwt_aux_scan_right_to_left_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = 0, j = count - 6; i < j; i += 8) { libsais16_prefetchr(&cache[i + prefetch_distance]); SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index; if ((cache[i + 1].index & rm) == 0) { I[cache[i + 1].index / (rm + 1)] = buckets[cache[i + 0].symbol] + 1; } SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index; if ((cache[i + 3].index & rm) == 0) { I[cache[i + 3].index / (rm + 1)] = buckets[cache[i + 2].symbol] + 1; } SA[--buckets[cache[i + 4].symbol]] = cache[i + 4].index; if ((cache[i + 5].index & rm) == 0) { I[cache[i + 5].index / (rm + 1)] = buckets[cache[i + 4].symbol] + 1; } SA[--buckets[cache[i + 6].symbol]] = cache[i + 6].index; if ((cache[i + 7].index & rm) == 0) { I[cache[i + 7].index / (rm + 1)] = buckets[cache[i + 6].symbol] + 1; } } for (j += 6; i < j; i += 2) { SA[--buckets[cache[i].symbol]] = cache[i].index; if ((cache[i + 1].index & rm) == 0) { I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol] + 1; } } } static void libsais16_final_sorting_scan_right_to_left_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) { libsais16_prefetchw(&SA[i + 2 * prefetch_distance]); sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? 
/*
 * Sequential "sort" step of the blocked right-to-left final scan for sa_sint_t texts.
 *
 * Walks the cache entries with indices omp_block_start .. omp_block_start + omp_block_size - 1
 * from the highest index down (the caller in this file passes the cache pointer shifted by
 * the block start, so entries are addressed by SA position). For every entry whose symbol
 * v is non-negative:
 *   - induction_bucket[v] is decremented and the resulting slot replaces the entry's symbol;
 *   - when that slot is >= omp_block_start, the induced suffix is chained inside the cache:
 *     the entry's own index is stripped to index & SAINT_MAX and, if the original index np
 *     is positive, cache[slot] receives index (np - 1) with bit (SAINT_BIT - 1) set when
 *     T[np - 2] > T[np - 1] (self-comparison when np - 1 == 0), and symbol T[np - 1].
 * Entries with a negative symbol are left untouched.
 *
 * The statement order matters: cache[slot] may be an entry this loop visits later.
 */
static void libsais16_final_sorting_scan_right_to_left_32s_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Main loop: two entries per iteration, with look-ahead prefetch of cache entries and their bucket counters. */
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais16_prefetchw(&cache[i - 2 * prefetch_distance]);

        /* Non-positive symbols are clamped to bucket 0 so the prefetch address stays inside the bucket array. */
        sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 > 0 ? s0 : 0]; libsais16_prefetchw(Is0);
        sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 > 0 ? s1 : 0]; libsais16_prefetchw(Is1);

        sa_sint_t v0 = cache[i - 0].symbol;
        if (v0 >= 0)
        {
            /* From here on the symbol field holds the destination slot instead of the symbol. */
            cache[i - 0].symbol = --induction_bucket[v0];
            if (cache[i - 0].symbol >= omp_block_start)
            {
                sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; cache[i - 0].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | (sa_sint_t)((sa_uint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; }
            }
        }

        sa_sint_t v1 = cache[i - 1].symbol;
        if (v1 >= 0)
        {
            cache[i - 1].symbol = --induction_bucket[v1];
            if (cache[i - 1].symbol >= omp_block_start)
            {
                sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; cache[i - 1].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | (sa_sint_t)((sa_uint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; }
            }
        }
    }

    /* Tail loop: j is restored to omp_block_start; same processing, one entry at a time and without prefetching. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t v = cache[i].symbol;
        if (v >= 0)
        {
            cache[i].symbol = --induction_bucket[v];
            if (cache[i].symbol >= omp_block_start)
            {
                sa_sint_t ni = cache[i].symbol, np = cache[i].index; cache[i].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | (sa_sint_t)((sa_uint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; }
            }
        }
    }
}
k : 256) && omp_get_dynamic() == 0) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(k); UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { libsais16_final_bwt_scan_right_to_left_16u(T, SA, induction_bucket, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.count = libsais16_final_bwt_scan_right_to_left_16u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); } #pragma omp barrier #pragma omp master { fast_sint_t t; for (t = omp_num_threads - 1; t >= 0; --t) { sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; } } } #pragma omp barrier { libsais16_final_order_scan_right_to_left_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); } } #endif } } static void libsais16_final_bwt_aux_scan_right_to_left_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ? 
k : 256) && omp_get_dynamic() == 0) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(k); UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { libsais16_final_bwt_aux_scan_right_to_left_16u(T, SA, rm, I, induction_bucket, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.count = libsais16_final_bwt_aux_scan_right_to_left_16u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); } #pragma omp barrier #pragma omp master { fast_sint_t t; for (t = omp_num_threads - 1; t >= 0; --t) { sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; } } } #pragma omp barrier { libsais16_final_bwt_aux_scan_right_to_left_16u_block_place(SA, rm, I, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); } } #endif } } static void libsais16_final_sorting_scan_right_to_left_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ? 
k : 256) && omp_get_dynamic() == 0) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(k); UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { libsais16_final_sorting_scan_right_to_left_16u(T, SA, induction_bucket, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.count = libsais16_final_sorting_scan_right_to_left_16u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); } #pragma omp barrier #pragma omp master { fast_sint_t t; for (t = omp_num_threads - 1; t >= 0; --t) { sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; } } } #pragma omp barrier { libsais16_final_order_scan_right_to_left_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); } } #endif } } static void libsais16_final_gsa_scan_right_to_left_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ? 
k : 256) && omp_get_dynamic() == 0) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(k); UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { libsais16_final_gsa_scan_right_to_left_16u(T, SA, induction_bucket, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.count = libsais16_final_sorting_scan_right_to_left_16u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); } #pragma omp barrier #pragma omp master { fast_sint_t t; for (t = omp_num_threads - 1; t >= 0; --t) { sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; } } } #pragma omp barrier { libsais16_final_gsa_scan_right_to_left_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); } } #endif } } static void libsais16_final_sorting_scan_right_to_left_32s_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = 
omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); UNUSED(cache); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { libsais16_final_sorting_scan_right_to_left_32s(T, SA, buckets, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { libsais16_final_sorting_scan_right_to_left_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); } #pragma omp barrier #pragma omp master { libsais16_final_sorting_scan_right_to_left_32s_block_sort(T, buckets, cache - block_start, block_start, block_size); } #pragma omp barrier { libsais16_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); } } #endif } } #endif static sa_sint_t libsais16_final_bwt_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { sa_sint_t index = -1; if (threads == 1 || n < 65536) { index = libsais16_final_bwt_scan_right_to_left_16u(T, SA, induction_bucket, 0, n); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t block_start; for (block_start = (fast_sint_t)n - 1; block_start >= 0; ) { if (SA[block_start] == 0) { index = (sa_sint_t)block_start--; } else { fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < 0) { block_max_end = -1; } fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } fast_sint_t block_size = block_start - block_end; if 
(block_size < 32) { for (; block_start > block_end; block_start -= 1) { sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[block_start] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; } } } else { libsais16_final_bwt_scan_right_to_left_16u_block_omp(T, SA, k, induction_bucket, block_end + 1, block_size, threads, thread_state); block_start = block_end; } } } } #else UNUSED(k); UNUSED(thread_state); #endif return index; } static void libsais16_final_bwt_aux_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { if (threads == 1 || n < 65536) { libsais16_final_bwt_aux_scan_right_to_left_16u(T, SA, rm, I, induction_bucket, 0, n); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t block_start; for (block_start = (fast_sint_t)n - 1; block_start >= 0; ) { if (SA[block_start] == 0) { block_start--; } else { fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * ((LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads) / 2); if (block_max_end < 0) { block_max_end = -1; } fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } fast_sint_t block_size = block_start - block_end; if (block_size < 32) { for (; block_start > block_end; block_start -= 1) { sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[block_start] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? 
p : t; if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]] + 1; } } } } else { libsais16_final_bwt_aux_scan_right_to_left_16u_block_omp(T, SA, k, rm, I, induction_bucket, block_end + 1, block_size, threads, thread_state); block_start = block_end; } } } } #else UNUSED(k); UNUSED(thread_state); #endif } static void libsais16_final_sorting_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { if (threads == 1 || omp_block_size < 65536) { libsais16_final_sorting_scan_right_to_left_16u(T, SA, induction_bucket, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t block_start; for (block_start = omp_block_start + omp_block_size - 1; block_start >= omp_block_start; ) { if (SA[block_start] == 0) { block_start--; } else { fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < omp_block_start) { block_max_end = omp_block_start - 1; } fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } fast_sint_t block_size = block_start - block_end; if (block_size < 32) { for (; block_start > block_end; block_start -= 1) { sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } } } else { libsais16_final_sorting_scan_right_to_left_16u_block_omp(T, SA, k, induction_bucket, block_end + 1, block_size, threads, thread_state); block_start = block_end; } } } } #else UNUSED(k); UNUSED(thread_state); #endif } static void libsais16_final_gsa_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size, 
sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { if (threads == 1 || omp_block_size < 65536) { libsais16_final_gsa_scan_right_to_left_16u(T, SA, induction_bucket, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t block_start; for (block_start = omp_block_start + omp_block_size - 1; block_start >= omp_block_start; ) { if (SA[block_start] == 0) { block_start--; } else { fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < omp_block_start) { block_max_end = omp_block_start - 1; } fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } fast_sint_t block_size = block_start - block_end; if (block_size < 32) { for (; block_start > block_end; block_start -= 1) { sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0 && T[p - 1] > 0) { p--; SA[--induction_bucket[T[p]]] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } } } else { libsais16_final_gsa_scan_right_to_left_16u_block_omp(T, SA, k, induction_bucket, block_end + 1, block_size, threads, thread_state); block_start = block_end; } } } } #else UNUSED(k); UNUSED(thread_state); #endif } static void libsais16_final_sorting_scan_right_to_left_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { if (threads == 1 || n < 65536) { libsais16_final_sorting_scan_right_to_left_32s(T, SA, induction_bucket, 0, n); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t block_start, block_end; for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end) { block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; } 
libsais16_final_sorting_scan_right_to_left_32s_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads); } } #else UNUSED(thread_state); #endif } static void libsais16_clear_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT bucket_start, sa_sint_t * RESTRICT bucket_end, sa_sint_t threads) { fast_sint_t c; #if defined(LIBSAIS_OPENMP) #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && n >= 65536) #else UNUSED(threads); UNUSED(n); #endif for (c = 0; c < k; ++c) { if (bucket_end[c] > bucket_start[c]) { memset(&SA[bucket_start[c]], 0, ((size_t)bucket_end[c] - (size_t)bucket_start[c]) * sizeof(sa_sint_t)); } } } static sa_sint_t libsais16_induce_final_order_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t flags, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { if ((flags & LIBSAIS_FLAGS_BWT) == 0) { if (flags & LIBSAIS_FLAGS_GSA) { buckets[6 * ALPHABET_SIZE] = buckets[7 * ALPHABET_SIZE] - 1; } libsais16_final_sorting_scan_left_to_right_16u_omp(T, SA, n, k, &buckets[6 * ALPHABET_SIZE], threads, thread_state); if (threads > 1 && n >= 65536) { libsais16_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); } if (flags & LIBSAIS_FLAGS_GSA) { libsais16_flip_suffix_markers_omp(SA, buckets[7 * ALPHABET_SIZE], threads); libsais16_final_gsa_scan_right_to_left_16u_omp(T, SA, buckets[7 * ALPHABET_SIZE], (fast_sint_t)n - buckets[7 * ALPHABET_SIZE], k, &buckets[7 * ALPHABET_SIZE], threads, thread_state); } else { libsais16_final_sorting_scan_right_to_left_16u_omp(T, SA, 0, n, k, &buckets[7 * ALPHABET_SIZE], threads, thread_state); } return 0; } else if (I != NULL) { libsais16_final_bwt_aux_scan_left_to_right_16u_omp(T, SA, n, k, r - 1, I, &buckets[6 * ALPHABET_SIZE], 
threads, thread_state); if (threads > 1 && n >= 65536) { libsais16_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); } libsais16_final_bwt_aux_scan_right_to_left_16u_omp(T, SA, n, k, r - 1, I, &buckets[7 * ALPHABET_SIZE], threads, thread_state); return 0; } else { libsais16_final_bwt_scan_left_to_right_16u_omp(T, SA, n, k, &buckets[6 * ALPHABET_SIZE], threads, thread_state); if (threads > 1 && n >= 65536) { libsais16_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); } return libsais16_final_bwt_scan_right_to_left_16u_omp(T, SA, n, k, &buckets[7 * ALPHABET_SIZE], threads, thread_state); } } static void libsais16_induce_final_order_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { libsais16_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[4 * (fast_sint_t)k], threads, thread_state); libsais16_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[5 * (fast_sint_t)k], threads, thread_state); } static void libsais16_induce_final_order_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { libsais16_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[2 * (fast_sint_t)k], threads, thread_state); libsais16_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[3 * (fast_sint_t)k], threads, thread_state); } static void libsais16_induce_final_order_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { libsais16_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[1 * (fast_sint_t)k], threads, thread_state); 
/*
 * Final induction for sa_sint_t texts with a single k-entry bucket array.
 *
 * The same array is reused for both directions, so it is rebuilt before each scan:
 * symbols are counted, the counts are converted by the "start" initializer ahead of the
 * left-to-right scan and by the "end" initializer ahead of the right-to-left scan.
 */
static void libsais16_induce_final_order_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    int pass;

    for (pass = 0; pass < 2; ++pass)
    {
        /* Both passes begin by recounting into the shared bucket array. */
        libsais16_count_suffixes_32s(T, n, k, buckets);

        if (pass == 0)
        {
            libsais16_initialize_buckets_start_32s_1k(k, buckets);
            libsais16_final_sorting_scan_left_to_right_32s_omp(T, SA, n, buckets, threads, thread_state);
        }
        else
        {
            libsais16_initialize_buckets_end_32s_1k(k, buckets);
            libsais16_final_sorting_scan_right_to_left_32s_omp(T, SA, n, buckets, threads, thread_state);
        }
    }
}
Tq1 : &SAm[q1 >> 1]); sa_uint_t q2 = (sa_uint_t)SA[i + prefetch_distance + 2]; sa_sint_t * RESTRICT Tq2 = &T[q2]; libsais16_prefetchw(SAm[q2 >> 1] < 0 ? Tq2 : &SAm[q2 >> 1]); sa_uint_t q3 = (sa_uint_t)SA[i + prefetch_distance + 3]; sa_sint_t * RESTRICT Tq3 = &T[q3]; libsais16_prefetchw(SAm[q3 >> 1] < 0 ? Tq3 : &SAm[q3 >> 1]); sa_uint_t p0 = (sa_uint_t)SA[i + 0]; sa_sint_t s0 = SAm[p0 >> 1]; if (s0 < 0) { T[p0] |= SAINT_MIN; f++; s0 = i + 0 + SAINT_MIN + f; } SAm[p0 >> 1] = s0 - f; sa_uint_t p1 = (sa_uint_t)SA[i + 1]; sa_sint_t s1 = SAm[p1 >> 1]; if (s1 < 0) { T[p1] |= SAINT_MIN; f++; s1 = i + 1 + SAINT_MIN + f; } SAm[p1 >> 1] = s1 - f; sa_uint_t p2 = (sa_uint_t)SA[i + 2]; sa_sint_t s2 = SAm[p2 >> 1]; if (s2 < 0) { T[p2] |= SAINT_MIN; f++; s2 = i + 2 + SAINT_MIN + f; } SAm[p2 >> 1] = s2 - f; sa_uint_t p3 = (sa_uint_t)SA[i + 3]; sa_sint_t s3 = SAm[p3 >> 1]; if (s3 < 0) { T[p3] |= SAINT_MIN; f++; s3 = i + 3 + SAINT_MIN + f; } SAm[p3 >> 1] = s3 - f; } for (j += 2 * (sa_sint_t)prefetch_distance + 3; i < j; i += 1) { sa_uint_t p = (sa_uint_t)SA[i]; sa_sint_t s = SAm[p >> 1]; if (s < 0) { T[p] |= SAINT_MIN; f++; s = i + SAINT_MIN + f; } SAm[p >> 1] = s - f; } return f; } static void libsais16_compact_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t * pl, fast_sint_t * pr, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; sa_uint_t * RESTRICT SAl = (sa_uint_t *)&SA[0]; sa_uint_t * RESTRICT SAr = (sa_uint_t *)&SA[0]; fast_sint_t i, j, l = *pl - 1, r = *pr - 1; for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j; i -= 4) { libsais16_prefetchr(&SA[i - prefetch_distance]); sa_uint_t p0 = (sa_uint_t)SA[i - 0]; SAl[l] = p0 & SAINT_MAX; l -= (sa_sint_t)p0 < 0; SAr[r] = p0 - 1; r -= (sa_sint_t)p0 > 0; sa_uint_t p1 = (sa_uint_t)SA[i - 1]; SAl[l] = p1 & SAINT_MAX; l -= (sa_sint_t)p1 < 0; SAr[r] = p1 - 1; r -= (sa_sint_t)p1 > 0; 
sa_uint_t p2 = (sa_uint_t)SA[i - 2]; SAl[l] = p2 & SAINT_MAX; l -= (sa_sint_t)p2 < 0; SAr[r] = p2 - 1; r -= (sa_sint_t)p2 > 0;
        sa_uint_t p3 = (sa_uint_t)SA[i - 3]; SAl[l] = p3 & SAINT_MAX; l -= (sa_sint_t)p3 < 0; SAr[r] = p3 - 1; r -= (sa_sint_t)p3 > 0;
    }

    /* Tail: restore j to the first index of the block and finish one entry at a time. */
    for (j -= 3; i >= j; i -= 1)
    {
        sa_uint_t p = (sa_uint_t)SA[i]; SAl[l] = p & SAINT_MAX; l -= (sa_sint_t)p < 0; SAr[r] = p - 1; r -= (sa_sint_t)p > 0;
    }

    /* Report the lowest kept index of each run back to the caller. */
    *pl = l + 1; *pr = r + 1;
}

#if defined(LIBSAIS_OPENMP)

/*
 * Counts the entries i in [omp_block_start, omp_block_start + omp_block_size)
 * for which SAm[SA[i] >> 1] is negative (SAm = &SA[m]). Read-only.
 *
 * Only compiled with OpenMP. Four independent accumulators are used in the
 * unrolled loop and summed at the end.
 */
static sa_sint_t libsais16_count_unique_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT SAm = &SA[m];

    fast_sint_t i, j; sa_sint_t f0 = 0, f1 = 0, f2 = 0, f3 = 0;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
    {
        libsais16_prefetchr(&SA[i + 2 * prefetch_distance]);

        libsais16_prefetchr(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
        libsais16_prefetchr(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
        libsais16_prefetchr(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]);
        libsais16_prefetchr(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]);

        f0 += SAm[((sa_uint_t)SA[i + 0]) >> 1] < 0;
        f1 += SAm[((sa_uint_t)SA[i + 1]) >> 1] < 0;
        f2 += SAm[((sa_uint_t)SA[i + 2]) >> 1] < 0;
        f3 += SAm[((sa_uint_t)SA[i + 3]) >> 1] < 0;
    }

    for (j += prefetch_distance + 3; i < j; i += 1)
    {
        f0 += SAm[((sa_uint_t)SA[i]) >> 1] < 0;
    }

    return f0 + f1 + f2 + f3;
}

#endif

/*
 * Runs libsais16_renumber_unique_and_nonunique_lms_suffixes_32s over SA[0 .. m),
 * optionally split across OpenMP threads, and returns the final counter value.
 *
 * The parallel region is only entered when threads > 1 and m >= 65536. In the
 * multi-threaded case each thread first counts the negative slots in its own
 * block, and after a barrier starts its renumbering pass from the sum of the
 * counts of all lower-numbered threads.
 */
static sa_sint_t libsais16_renumber_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t f = 0;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num    = 0;
        fast_sint_t omp_num_threads   = 1;
#endif
        /* Per-thread block length is rounded down to a multiple of 16; the last thread takes the remainder. */
        fast_sint_t omp_block_stride  = (m / omp_num_threads) & (-16);
        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;

        if (omp_num_threads == 1)
        {
            f = libsais16_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, 0, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                thread_state[omp_thread_num].state.count = libsais16_count_unique_suffixes(SA, m, omp_block_start, omp_block_size);
            }

            /* Every per-thread count must be published before any thread computes its starting offset. */
            #pragma omp barrier

            {
                fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }

                /* Only the last thread sees the full prefix, so it alone publishes the total. */
                if (omp_thread_num == omp_num_threads - 1)
                {
                    f = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
                }

                libsais16_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
            }
        }
#endif
    }

    return f;
}

/*
 * Runs libsais16_compact_unique_and_nonunique_lms_suffixes_32s over the n / 2
 * entries starting at SA[m], optionally split across OpenMP threads, and then
 * copies f entries from SA[m - f ..] to SA[n + fs - m ..].
 *
 * Single-threaded, the two output runs grow downward from index m and from
 * index n + fs. Multi-threaded (threads > 1, n >= 131072 and m < fs), each
 * thread compacts its block into private positions, and after a barrier the
 * master thread moves the per-thread runs so that they end at m and at
 * n + fs respectively.
 */
static void libsais16_compact_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072 && m < fs)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num    = 0;
        fast_sint_t omp_num_threads   = 1;
#endif
        fast_sint_t omp_block_stride  = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start;

        if (omp_num_threads == 1)
        {
            fast_sint_t l = m, r = (fast_sint_t)n + (fast_sint_t)fs;
            libsais16_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &l, &r, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                /* state.position is used as the "l" cursor and state.count as the "r" cursor of this thread. */
                thread_state[omp_thread_num].state.position    = (fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_start + omp_block_size;
                thread_state[omp_thread_num].state.count       = (fast_sint_t)m + omp_block_start + omp_block_size;

                libsais16_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &thread_state[omp_thread_num].state.position, &thread_state[omp_thread_num].state.count, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            #pragma omp master
            {
                fast_sint_t t, position;

                /* Gather the per-thread "l" runs, last thread first, so that they end at index m. */
                for (position = m, t = omp_num_threads - 1; t >= 0; --t)
                {
                    fast_sint_t omp_block_end     = t < omp_num_threads - 1 ? omp_block_stride * (t + 1) : ((fast_sint_t)n >> 1);
                    fast_sint_t count             = ((fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_end - thread_state[t].state.position);

                    if (count > 0)
                    {
                        position -= count; memcpy(&SA[position], &SA[thread_state[t].state.position], (size_t)count * sizeof(sa_sint_t));
                    }
                }

                /* Gather the per-thread "r" runs, last thread first, so that they end at index n + fs. */
                for (position = (fast_sint_t)n + (fast_sint_t)fs, t = omp_num_threads - 1; t >= 0; --t)
                {
                    fast_sint_t omp_block_end     = t < omp_num_threads - 1 ? omp_block_stride * (t + 1) : ((fast_sint_t)n >> 1);
                    fast_sint_t count             = ((fast_sint_t)m + omp_block_end - thread_state[t].state.count);

                    if (count > 0)
                    {
                        position -= count; memcpy(&SA[position], &SA[thread_state[t].state.count], (size_t)count * sizeof(sa_sint_t));
                    }
                }
            }
        }
#endif
    }

    memcpy(&SA[(fast_sint_t)n + (fast_sint_t)fs - (fast_sint_t)m], &SA[(fast_sint_t)m - (fast_sint_t)f], (size_t)f * sizeof(sa_sint_t));
}

/*
 * Renumbers and then compacts the LMS suffixes; returns the counter f produced
 * by the renumbering step (the same value is passed on to the compaction step).
 */
static sa_sint_t libsais16_compact_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t f = libsais16_renumber_unique_and_nonunique_lms_suffixes_32s_omp(T, SA, m, threads, thread_state);
    libsais16_compact_unique_and_nonunique_lms_suffixes_32s_omp(SA, n, m, fs, f, threads, thread_state);

    return f;
}

/*
 * Scans T[omp_block_start .. omp_block_start + omp_block_size) for entries
 * with the top bit set (negative). For each one found at index i:
 *   - the top bit of T[i] is cleared,
 *   - i is stored at SA[tmp], where tmp is the next value consumed from the
 *     sequence that starts at SA[n - m - 1 + l],
 *   - the following index is skipped (the extra i++).
 *
 * NOTE: the source sequence is read one element ahead (tmp is loaded before it
 * is known to be needed).
 */
static void libsais16_merge_unique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];

    sa_sint_t i, j; fast_sint_t tmp = *SAnm++;
    /* The unrolled body can advance i by up to 8 per iteration, hence the "- 6" margin on j. */
    for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 6; i < j; i += 4)
    {
        libsais16_prefetchr(&T[i + prefetch_distance]);

        sa_sint_t c0 = T[i + 0]; if (c0 < 0) { T[i + 0] = c0 & SAINT_MAX; SA[tmp] = i + 0; i++; tmp = *SAnm++; }
        sa_sint_t c1 = T[i + 1]; if (c1 < 0) { T[i + 1] = c1 & SAINT_MAX; SA[tmp] = i + 1; i++; tmp = *SAnm++; }
        sa_sint_t c2 = T[i + 2]; if (c2 < 0) { T[i + 2] = c2 & SAINT_MAX; SA[tmp] = i + 2; i++; tmp = *SAnm++; }
        sa_sint_t c3 = T[i + 3]; if (c3 < 0) { T[i + 3] = c3 & SAINT_MAX; SA[tmp] = i + 3; i++; tmp = *SAnm++; }
    }

    for (j += 6; i < j; i += 1)
    {
        sa_sint_t c = T[i]; if (c < 0) { T[i] = c & SAINT_MAX; SA[tmp] = i; i++; tmp = *SAnm++; }
    }
}

/*
 * Fills every zero entry of SA[omp_block_start .. omp_block_start + omp_block_size)
 * with the next value consumed from the sequence that starts at
 * SA[n - m - 1 + l]; non-zero entries are left untouched.
 */
static void libsais16_merge_nonunique_lms_suffixes_32s(sa_sint_t *
RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];

    /* tmp always holds the next replacement value, loaded one element ahead of its use. */
    fast_sint_t i, j; sa_sint_t tmp = *SAnm++;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4)
    {
        libsais16_prefetchr(&SA[i + prefetch_distance]);

        if (SA[i + 0] == 0) { SA[i + 0] = tmp; tmp = *SAnm++; }
        if (SA[i + 1] == 0) { SA[i + 1] = tmp; tmp = *SAnm++; }
        if (SA[i + 2] == 0) { SA[i + 2] = tmp; tmp = *SAnm++; }
        if (SA[i + 3] == 0) { SA[i + 3] = tmp; tmp = *SAnm++; }
    }

    for (j += 3; i < j; i += 1)
    {
        if (SA[i] == 0) { SA[i] = tmp; tmp = *SAnm++; }
    }
}

/*
 * Runs libsais16_merge_unique_lms_suffixes_32s over T[0 .. n), optionally
 * split across OpenMP threads (threads > 1 and n >= 65536).
 *
 * In the multi-threaded case each thread first counts the negative-marked
 * entries of its block; after a barrier it starts consuming the source
 * sequence at the offset given by the counts of all lower-numbered threads.
 */
static void libsais16_merge_unique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num    = 0;
        fast_sint_t omp_num_threads   = 1;
#endif
        /* Per-thread block length is rounded down to a multiple of 16; the last thread takes the remainder. */
        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16);
        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

        if (omp_num_threads == 1)
        {
            libsais16_merge_unique_lms_suffixes_32s(T, SA, n, m, 0, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                thread_state[omp_thread_num].state.count = libsais16_count_negative_marked_suffixes(T, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            {
                fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }

                libsais16_merge_unique_lms_suffixes_32s(T, SA, n, m, count, omp_block_start, omp_block_size);
            }
        }
#endif
    }
}

/*
 * Runs libsais16_merge_nonunique_lms_suffixes_32s over SA[0 .. m), optionally
 * split across OpenMP threads (threads > 1 and m >= 65536).
 *
 * The source offset of the first block is f; in the multi-threaded case each
 * thread adds the zero-entry counts of all lower-numbered threads to it.
 */
static void libsais16_merge_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num    = 0;
        fast_sint_t omp_num_threads   = 1;
#endif
        fast_sint_t omp_block_stride  = (m / omp_num_threads) & (-16);
        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;

        if (omp_num_threads == 1)
        {
            libsais16_merge_nonunique_lms_suffixes_32s(SA, n, m, f, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                thread_state[omp_thread_num].state.count = libsais16_count_zero_marked_suffixes(SA, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            {
                fast_sint_t t, count = f; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }

                libsais16_merge_nonunique_lms_suffixes_32s(SA, n, m, count, omp_block_start, omp_block_size);
            }
        }
#endif
    }
}

/*
 * Merges the compacted LMS suffixes back: first the entries marked in T
 * (merge_unique), then the zero slots of SA[0 .. m) (merge_nonunique).
 */
static void libsais16_merge_compacted_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    libsais16_merge_unique_lms_suffixes_32s_omp(T, SA, n, m, threads, thread_state);
    libsais16_merge_nonunique_lms_suffixes_32s_omp(SA, n, m, f, threads, thread_state);
}

/*
 * Rebuilds the LMS suffix list after the recursive call (variant that also
 * recounts into a 2k bucket layout).
 *
 * When f > 0 (some suffixes were compacted away before recursing) the f saved
 * entries are moved from SA[n + fs - m ..] to SA[n - m - 1 ..], the remaining
 * m - f suffixes are gathered and reconstructed, appended after the saved
 * ones, SA[0 .. m) is cleared and everything is merged back. When f == 0 a
 * plain gather + reconstruct over all m suffixes is performed.
 */
static void libsais16_reconstruct_compacted_lms_suffixes_32s_2k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    if (f > 0)
    {
        /* memmove: the code does not assume the two ranges are disjoint. */
        memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t));

        libsais16_count_and_gather_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, local_buckets, threads, thread_state);
        libsais16_reconstruct_lms_suffixes_omp(SA, n, m - f, threads);

        memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t));
        memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t));

        libsais16_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state);
    }
    else
    {
        libsais16_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
        libsais16_reconstruct_lms_suffixes_omp(SA, n, m, threads);
    }
}

/*
 * Same reconstruction as the 2k variant, but using the gather routines that
 * do not maintain bucket counts.
 */
static void libsais16_reconstruct_compacted_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    if (f > 0)
    {
        memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t));

        libsais16_gather_compacted_lms_suffixes_32s(T, SA, n);
        libsais16_reconstruct_lms_suffixes_omp(SA, n, m - f, threads);

        memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t));
        memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t));

        libsais16_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state);
    }
    else
    {
        libsais16_gather_lms_suffixes_32s(T, SA, n);
        libsais16_reconstruct_lms_suffixes_omp(SA, n, m, threads);
    }
}

/*
 * Recursive driver for integer-alphabet input (T holds n symbols, alphabet
 * size k, SA has n + fs usable entries).
 *
 * One of four strategies is chosen by how many k-entry bucket arrays fit
 * either into the fs spare entries behind SA[n] or into local_buffer:
 *   - 6 arrays               -> "6k" path,
 *   - 4 arrays and n <= SAINT_MAX / 2 -> "4k" path,
 *   - 2 arrays               -> "2k" path,
 *   - otherwise              -> "1k" path, which heap-allocates a k-entry
 *                               array when even fs < k.
 * In the first three paths local_buffer is preferred over the SA tail
 * whenever LIBSAIS_LOCAL_BUFFER_SIZE > fs.
 *
 * Each path sorts the LMS suffixes, names them, and, if the names are not all
 * distinct (names < m), recurses on the reduced problem stored in the tail of
 * SA before inducing the final order.
 *
 * Returns 0 on success, -2 if a bucket allocation fails or a nested call
 * returns non-zero.
 */
static sa_sint_t libsais16_main_32s_recursion(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state, sa_sint_t * RESTRICT local_buffer)
{
    /* Clamp fs so that n + fs cannot exceed SAINT_MAX. */
    fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n);

    if (k > 0 && ((fs / k >= 6) || (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 6)))
    {
        /* Place the buckets at the end of the spare area, aligned to 1024 (or 16) entries when the spare area is large enough to absorb the padding. */
        sa_sint_t alignment = (fs - 1024) / k >= 6 ? (sa_sint_t)1024 : (sa_sint_t)16;
        sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 6 ? (sa_sint_t *)libsais16_align_up(&SA[n + fs - 6 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 6 * (fast_sint_t)k];
        buckets = (LIBSAIS_LOCAL_BUFFER_SIZE > fs) ? local_buffer : buckets;

        sa_sint_t m = libsais16_count_and_gather_lms_suffixes_32s_4k_omp(T, SA, n, k, buckets, buckets == local_buffer, threads, thread_state);
        if (m > 1)
        {
            memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t));

            sa_sint_t first_lms_suffix    = SA[n - m];
            sa_sint_t left_suffixes_count = libsais16_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(T, k, buckets, first_lms_suffix);

            libsais16_radix_sort_lms_suffixes_32s_6k_omp(T, SA, n, m, &buckets[4 * (fast_sint_t)k], threads, thread_state);
            /* The same (n / 8192) < k test selects the marker-based naming and the compaction step further down. */
            if ((n / 8192) < k) { libsais16_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * (fast_sint_t)k], threads); }

            if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); }

            libsais16_initialize_buckets_for_partial_sorting_32s_6k(T, k, buckets, first_lms_suffix, left_suffixes_count);
            libsais16_induce_partial_order_32s_6k_omp(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state);

            sa_sint_t names = (n / 8192) < k
                ? libsais16_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state)
                : libsais16_renumber_and_gather_lms_suffixes_omp(SA, n, m, fs, threads, thread_state);

            if (names < m)
            {
                sa_sint_t f = (n / 8192) < k ? libsais16_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state) : 0;

                /* Reduced text lives at SA + n + fs - m + f (m - f symbols, names - f distinct); SA itself is reused as its suffix array. */
                if (libsais16_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0)
                {
                    return -2;
                }

                libsais16_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, buckets == local_buffer, threads, thread_state);
            }
            else
            {
                libsais16_count_lms_suffixes_32s_2k(T, n, k, buckets);
            }

            libsais16_initialize_buckets_start_and_end_32s_4k(k, buckets);
            libsais16_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets);
            libsais16_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state);
        }
        else
        {
            SA[0] = SA[n - 1];

            libsais16_initialize_buckets_start_and_end_32s_6k(k, buckets);
            libsais16_place_lms_suffixes_histogram_32s_6k(SA, n, k, m, buckets);
            libsais16_induce_final_order_32s_6k(T, SA, n, k, buckets, threads, thread_state);
        }

        return 0;
    }
    else if (k > 0 && (n <= SAINT_MAX / 2) && ((fs / k >= 4) || (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 4)))
    {
        sa_sint_t alignment = (fs - 1024) / k >= 4 ? (sa_sint_t)1024 : (sa_sint_t)16;
        sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 4 ? (sa_sint_t *)libsais16_align_up(&SA[n + fs - 4 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 4 * (fast_sint_t)k];
        buckets = (LIBSAIS_LOCAL_BUFFER_SIZE > fs) ? local_buffer : buckets;

        sa_sint_t m = libsais16_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, buckets == local_buffer, threads, thread_state);
        if (m > 1)
        {
            libsais16_initialize_buckets_for_radix_and_partial_sorting_32s_4k(T, k, buckets, SA[n - m]);

            libsais16_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state);
            libsais16_radix_sort_set_markers_32s_4k_omp(SA, k, &buckets[1], threads);

            libsais16_place_lms_suffixes_interval_32s_4k(SA, n, k, m - 1, buckets);
            libsais16_induce_partial_order_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state);

            sa_sint_t names = libsais16_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state);
            if (names < m)
            {
                sa_sint_t f = libsais16_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);

                if (libsais16_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0)
                {
                    return -2;
                }

                libsais16_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, buckets == local_buffer, threads, thread_state);
            }
            else
            {
                libsais16_count_lms_suffixes_32s_2k(T, n, k, buckets);
            }
        }
        else
        {
            SA[0] = SA[n - 1];
        }

        libsais16_initialize_buckets_start_and_end_32s_4k(k, buckets);
        libsais16_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets);
        libsais16_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state);

        return 0;
    }
    else if (k > 0 && ((fs / k >= 2) || (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 2)))
    {
        sa_sint_t alignment = (fs - 1024) / k >= 2 ? (sa_sint_t)1024 : (sa_sint_t)16;
        sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 2 ? (sa_sint_t *)libsais16_align_up(&SA[n + fs - 2 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 2 * (fast_sint_t)k];
        buckets = (LIBSAIS_LOCAL_BUFFER_SIZE > fs) ? local_buffer : buckets;

        sa_sint_t m = libsais16_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, buckets == local_buffer, threads, thread_state);
        if (m > 1)
        {
            libsais16_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(T, k, buckets, SA[n - m]);

            libsais16_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state);
            libsais16_place_lms_suffixes_interval_32s_2k(SA, n, k, m - 1, buckets);

            libsais16_initialize_buckets_start_and_end_32s_2k(k, buckets);
            libsais16_induce_partial_order_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);

            sa_sint_t names = libsais16_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads);
            if (names < m)
            {
                sa_sint_t f = libsais16_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);

                if (libsais16_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0)
                {
                    return -2;
                }

                libsais16_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, buckets == local_buffer, threads, thread_state);
            }
            else
            {
                libsais16_count_lms_suffixes_32s_2k(T, n, k, buckets);
            }
        }
        else
        {
            SA[0] = SA[n - 1];
        }

        libsais16_initialize_buckets_end_32s_2k(k, buckets);
        libsais16_place_lms_suffixes_histogram_32s_2k(SA, n, k, m, buckets);

        libsais16_initialize_buckets_start_and_end_32s_2k(k, buckets);
        libsais16_induce_final_order_32s_2k(T, SA, n, k, buckets, threads, thread_state);

        return 0;
    }
    else
    {
        /* Heap buffer is needed only when the spare SA area cannot hold even one k-entry array. */
        sa_sint_t * buffer = fs < k ? (sa_sint_t *)libsais16_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096) : (sa_sint_t *)NULL;

        sa_sint_t alignment = fs - 1024 >= k ? (sa_sint_t)1024 : (sa_sint_t)16;
        sa_sint_t * RESTRICT buckets = fs - alignment >= k ? (sa_sint_t *)libsais16_align_up(&SA[n + fs - k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : fs >= k ? &SA[n + fs - k] : buffer;

        /* buckets can only be NULL here if the heap allocation above failed. */
        if (buckets == NULL) { return -2; }

        memset(SA, 0, (size_t)n * sizeof(sa_sint_t));

        libsais16_count_suffixes_32s(T, n, k, buckets);
        libsais16_initialize_buckets_end_32s_1k(k, buckets);

        sa_sint_t m = libsais16_radix_sort_lms_suffixes_32s_1k(T, SA, n, buckets);
        if (m > 1)
        {
            libsais16_induce_partial_order_32s_1k_omp(T, SA, n, k, buckets, threads, thread_state);

            sa_sint_t names = libsais16_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads);
            if (names < m)
            {
                /* Release the heap buffer for the duration of the recursion; it is re-allocated afterwards. */
                if (buffer != NULL) { libsais16_free_aligned(buffer); buckets = NULL; }

                sa_sint_t f = libsais16_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);

                if (libsais16_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0)
                {
                    /* If a heap buffer was in use it has already been freed above, so nothing leaks here. */
                    return -2;
                }

                libsais16_reconstruct_compacted_lms_suffixes_32s_1k_omp(T, SA, n, m, fs, f, threads, thread_state);

                if (buckets == NULL) { buckets = buffer = (sa_sint_t *)libsais16_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096); }
                if (buckets == NULL) { return -2; }
            }

            libsais16_count_suffixes_32s(T, n, k, buckets);
            libsais16_initialize_buckets_end_32s_1k(k, buckets);
            libsais16_place_lms_suffixes_interval_32s_1k(T, SA, k, m, buckets);
        }

        libsais16_induce_final_order_32s_1k(T, SA, n, k, buckets, threads, thread_state);
        libsais16_free_aligned(buffer);

        return 0;
    }
}

/*
 * Entry point into the integer-alphabet recursion. Provides a stack array of
 * 2 * LIBSAIS_LOCAL_BUFFER_SIZE entries and hands the recursion a pointer to
 * its middle, i.e. LIBSAIS_LOCAL_BUFFER_SIZE entries are usable from that
 * pointer onward.
 * NOTE(review): the purpose of the unused first half is not visible here -
 * confirm before changing the size.
 */
static sa_sint_t libsais16_main_32s_entry(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t local_buffer[2 * LIBSAIS_LOCAL_BUFFER_SIZE];

    return libsais16_main_32s_recursion(T, SA, n, k, fs, threads, thread_state, local_buffer + LIBSAIS_LOCAL_BUFFER_SIZE);
}

/*
 * Top-level driver for 16-bit input: sorts the LMS suffixes of T, recurses
 * into the integer-alphabet code when their names are not all distinct, and
 * finishes with the final induction (suffix array or BWT, depending on flags).
 *
 * Returns the result of the final induction step, -1 when the GSA bucket
 * check fails, or -2 when the nested recursion fails.
 */
static sa_sint_t libsais16_main_16u(const uint16_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t flags, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads, LIBSAIS_THREAD_STATE *
RESTRICT thread_state)
{
    /* Clamp fs so that n + fs cannot exceed SAINT_MAX. */
    fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n);

    sa_sint_t m = libsais16_count_and_gather_lms_suffixes_16u_omp(T, SA, n, buckets, threads, thread_state);
    sa_sint_t k = libsais16_initialize_buckets_start_and_end_16u(buckets, freq);

    /* NOTE(review): presumably verifies that the zero separator is used as GSA mode requires - confirm against the bucket layout. */
    if ((flags & LIBSAIS_FLAGS_GSA) && (buckets[0] != 0 || buckets[2] != 0 || buckets[3] != 1))
    {
        return -1;
    }

    if (m > 0)
    {
        sa_sint_t first_lms_suffix    = SA[n - m];
        sa_sint_t left_suffixes_count = libsais16_initialize_buckets_for_lms_suffixes_radix_sort_16u(T, buckets, first_lms_suffix);

        if (threads > 1 && n >= 65536) { memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t)); }
        libsais16_radix_sort_lms_suffixes_16u_omp(T, SA, n, m, flags, buckets, threads, thread_state);
        if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); }

        libsais16_initialize_buckets_for_partial_sorting_16u(T, buckets, first_lms_suffix, left_suffixes_count);
        libsais16_induce_partial_order_16u_omp(T, SA, n, k, flags, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state);

        sa_sint_t names = libsais16_renumber_and_gather_lms_suffixes_omp(SA, n, m, fs, threads, thread_state);
        if (names < m)
        {
            /* Reduced text lives at SA + n + fs - m (m symbols, `names` distinct); SA itself is reused as its suffix array. */
            if (libsais16_main_32s_entry(SA + n + fs - m, SA, m, names, fs + n - 2 * m, threads, thread_state) != 0)
            {
                return -2;
            }

            libsais16_gather_lms_suffixes_16u_omp(T, SA, n, threads, thread_state);
            libsais16_reconstruct_lms_suffixes_omp(SA, n, m, threads);
        }

        libsais16_place_lms_suffixes_interval_16u(SA, n, m, flags, buckets);
    }
    else
    {
        memset(SA, 0, (size_t)n * sizeof(sa_sint_t));
    }

    return libsais16_induce_final_order_16u_omp(T, SA, n, k, flags, r, I, buckets, threads, thread_state);
}

/*
 * Allocates the working state for one run (per-thread state when threads > 1,
 * plus 8 * ALPHABET_SIZE bucket entries), invokes libsais16_main_16u and
 * releases everything again.
 *
 * Returns the result of libsais16_main_16u, or -2 when an allocation failed.
 */
static sa_sint_t libsais16_main(const uint16_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t flags, sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads)
{
    LIBSAIS_THREAD_STATE *  RESTRICT thread_state   = threads > 1 ? libsais16_alloc_thread_state(threads) : NULL;
    sa_sint_t *             RESTRICT buckets        = (sa_sint_t *)libsais16_alloc_aligned((size_t)8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);

    /* thread_state is legitimately NULL in the single-threaded case. */
    sa_sint_t index = buckets != NULL && (thread_state != NULL || threads == 1)
        ? libsais16_main_16u(T, SA, n, buckets, flags, r, I, fs, freq, threads, thread_state)
        : -2;

    libsais16_free_aligned(buckets);
    libsais16_free_thread_state(thread_state);

    return index;
}

/*
 * Same as libsais16_main but for integer-alphabet input: allocates per-thread
 * state when threads > 1 and calls libsais16_main_32s_entry.
 *
 * Returns that call's result, or -2 when the thread state allocation failed.
 */
static sa_sint_t libsais16_main_int(sa_sint_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads)
{
    LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais16_alloc_thread_state(threads) : NULL;

    sa_sint_t index = thread_state != NULL || threads == 1
        ? libsais16_main_32s_entry(T, SA, n, k, fs, threads, thread_state)
        : -2;

    libsais16_free_thread_state(thread_state);

    return index;
}

/*
 * Runs libsais16_main_16u with the buckets and thread state stored in a
 * caller-owned context. Returns -2 when the context is NULL or incomplete
 * (no buckets, or no thread state although ctx->threads != 1).
 */
static sa_sint_t libsais16_main_ctx(const LIBSAIS_CONTEXT * ctx, const uint16_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t flags, sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq)
{
    return ctx != NULL && (ctx->buckets != NULL && (ctx->thread_state != NULL || ctx->threads == 1))
        ? libsais16_main_16u(T, SA, n, ctx->buckets, flags, r, I, fs, freq, (sa_sint_t)ctx->threads, ctx->thread_state)
        : -2;
}

/*
 * Copies n values from A to U, truncating each to uint16_t. The main loop
 * handles 8 elements per iteration with a read prefetch 64 entries ahead.
 */
static void libsais16_bwt_copy_16u(uint16_t * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8)
    {
        libsais16_prefetchr(&A[i + prefetch_distance]);

        U[i + 0] = (uint16_t)A[i + 0];
        U[i + 1] = (uint16_t)A[i + 1];
        U[i + 2] = (uint16_t)A[i + 2];
        U[i + 3] = (uint16_t)A[i + 3];
        U[i + 4] = (uint16_t)A[i + 4];
        U[i + 5] = (uint16_t)A[i + 5];
        U[i + 6] = (uint16_t)A[i + 6];
        U[i + 7] = (uint16_t)A[i + 7];
    }

    for (j += 7; i < j; i += 1)
    {
        U[i] = (uint16_t)A[i];
    }
}

/*
 * Copies n values from A to U via libsais16_bwt_copy_16u, split into one
 * contiguous block per OpenMP thread when threads > 1 and n >= 65536;
 * without OpenMP the whole range is copied in a single call.
 */
static void libsais16_bwt_copy_16u_omp(uint16_t * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
        /* Per-thread block length is rounded down to a multiple of 16; the last thread takes the remainder. */
        fast_sint_t omp_block_stride  = ((fast_sint_t)n / omp_num_threads) & (-16);
        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : (fast_sint_t)n - omp_block_start; #else UNUSED(threads); fast_sint_t omp_block_start = 0; fast_sint_t omp_block_size = (fast_sint_t)n; #endif libsais16_bwt_copy_16u(U + omp_block_start, A + omp_block_start, (sa_sint_t)omp_block_size); } } void * libsais16_create_ctx(void) { return (void *)libsais16_create_ctx_main(1); } void libsais16_free_ctx(void * ctx) { libsais16_free_ctx_main((LIBSAIS_CONTEXT *)ctx); } int32_t libsais16(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq) { if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); } if (n == 1) { SA[0] = 0; if (freq != NULL) { freq[T[0]]++; } } return 0; } return libsais16_main(T, SA, n, LIBSAIS_FLAGS_NONE, 0, NULL, fs, freq, 1); } int32_t libsais16_gsa(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq) { if ((T == NULL) || (SA == NULL) || (n < 0) || (n > 0 && T[n - 1] != 0) || (fs < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); } if (n == 1) { SA[0] = 0; if (freq != NULL) { freq[T[0]]++; } } return 0; } return libsais16_main(T, SA, n, LIBSAIS_FLAGS_GSA, 0, NULL, fs, freq, 1); } int32_t libsais16_int(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs) { if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) { return -1; } else if (n <= 1) { if (n == 1) { SA[0] = 0; } return 0; } return libsais16_main_int(T, SA, n, k, fs, 1); } int32_t libsais16_ctx(const void * ctx, const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq) { if ((ctx == NULL) || (T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); } if (n == 1) { SA[0] = 0; if (freq != NULL) { freq[T[0]]++; } } return 0; } return libsais16_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, SA, n, LIBSAIS_FLAGS_NONE, 0, 
NULL, fs, freq); } int32_t libsais16_gsa_ctx(const void * ctx, const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq) { if ((ctx == NULL) || (T == NULL) || (SA == NULL) || (n < 0) || (n > 0 && T[n - 1] != 0) || (fs < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); } if (n == 1) { SA[0] = 0; if (freq != NULL) { freq[T[0]]++; } } return 0; } return libsais16_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, SA, n, LIBSAIS_FLAGS_GSA, 0, NULL, fs, freq); } int32_t libsais16_bwt(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); } if (n == 1) { U[0] = T[0]; if (freq != NULL) { freq[T[0]]++; } } return n; } sa_sint_t index = libsais16_main(T, A, n, LIBSAIS_FLAGS_BWT, 0, NULL, fs, freq, 1); if (index >= 0) { index++; U[0] = T[n - 1]; libsais16_bwt_copy_16u_omp(U + 1, A, index - 1, 1); libsais16_bwt_copy_16u_omp(U + index, A + index, n - index, 1); } return index; } int32_t libsais16_bwt_aux(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); } if (n == 1) { U[0] = T[0]; if (freq != NULL) { freq[T[0]]++; } } I[0] = n; return 0; } sa_sint_t index = libsais16_main(T, A, n, LIBSAIS_FLAGS_BWT, r, I, fs, freq, 1); if (index == 0) { U[0] = T[n - 1]; libsais16_bwt_copy_16u_omp(U + 1, A, I[0] - 1, 1); libsais16_bwt_copy_16u_omp(U + I[0], A + I[0], n - I[0], 1); } return index; } int32_t libsais16_bwt_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq) { if ((ctx == NULL) 
|| (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); } if (n == 1) { U[0] = T[0]; if (freq != NULL) { freq[T[0]]++; } } return n; } sa_sint_t index = libsais16_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, LIBSAIS_FLAGS_BWT, 0, NULL, fs, freq); if (index >= 0) { index++; U[0] = T[n - 1]; libsais16_bwt_copy_16u_omp(U + 1, A, index - 1, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads); libsais16_bwt_copy_16u_omp(U + index, A + index, n - index, (sa_sint_t)((const LIBSAIS_CONTEXT *)ctx)->threads); } return index; } int32_t libsais16_bwt_aux_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I) { if ((ctx == NULL) || (T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); } if (n == 1) { U[0] = T[0]; if (freq != NULL) { freq[T[0]]++; } } I[0] = n; return 0; } sa_sint_t index = libsais16_main_ctx((const LIBSAIS_CONTEXT *)ctx, T, A, n, LIBSAIS_FLAGS_BWT, r, I, fs, freq); if (index == 0) { U[0] = T[n - 1]; libsais16_bwt_copy_16u_omp(U + 1, A, I[0] - 1, (sa_sint_t)((const LIBSAIS_CONTEXT*)ctx)->threads); libsais16_bwt_copy_16u_omp(U + I[0], A + I[0], n - I[0], (sa_sint_t)((const LIBSAIS_CONTEXT*)ctx)->threads); } return index; } #if defined(LIBSAIS_OPENMP) void * libsais16_create_ctx_omp(int32_t threads) { if (threads < 0) { return NULL; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? 
threads : 1; return (void *)libsais16_create_ctx_main(threads); } int32_t libsais16_omp(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads) { if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0) || (threads < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); } if (n == 1) { SA[0] = 0; if (freq != NULL) { freq[T[0]]++; } } return 0; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? threads : 1; return libsais16_main(T, SA, n, LIBSAIS_FLAGS_NONE, 0, NULL, fs, freq, threads); } int32_t libsais16_gsa_omp(const uint16_t * T, int32_t * SA, int32_t n, int32_t fs, int32_t * freq, int32_t threads) { if ((T == NULL) || (SA == NULL) || (n < 0) || (n > 0 && T[n - 1] != 0) || (fs < 0) || (threads < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); } if (n == 1) { SA[0] = 0; if (freq != NULL) { freq[T[0]]++; } } return 0; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? threads : 1; return libsais16_main(T, SA, n, LIBSAIS_FLAGS_GSA, 0, NULL, fs, freq, threads); } int32_t libsais16_int_omp(int32_t * T, int32_t * SA, int32_t n, int32_t k, int32_t fs, int32_t threads) { if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0) || (threads < 0)) { return -1; } else if (n <= 1) { if (n == 1) { SA[0] = 0; } return 0; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? 
threads : 1; return libsais16_main_int(T, SA, n, k, fs, threads); } int32_t libsais16_bwt_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t threads) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (threads < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); } if (n == 1) { U[0] = T[0]; if (freq != NULL) { freq[T[0]]++; } } return n; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? threads : 1; sa_sint_t index = libsais16_main(T, A, n, LIBSAIS_FLAGS_BWT, 0, NULL, fs, freq, threads); if (index >= 0) { index++; U[0] = T[n - 1]; libsais16_bwt_copy_16u_omp(U + 1, A, index - 1, threads); libsais16_bwt_copy_16u_omp(U + index, A + index, n - index, threads); } return index; } int32_t libsais16_bwt_aux_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, int32_t fs, int32_t * freq, int32_t r, int32_t * I, int32_t threads) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL) || (threads < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int32_t)); } if (n == 1) { U[0] = T[0]; if (freq != NULL) { freq[T[0]]++; } } I[0] = n; return 0; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? 
threads : 1; sa_sint_t index = libsais16_main(T, A, n, LIBSAIS_FLAGS_BWT, r, I, fs, freq, threads); if (index == 0) { U[0] = T[n - 1]; libsais16_bwt_copy_16u_omp(U + 1, A, I[0] - 1, threads); libsais16_bwt_copy_16u_omp(U + I[0], A + I[0], n - I[0], threads); } return index; } #endif static LIBSAIS_UNBWT_CONTEXT * libsais16_unbwt_create_ctx_main(sa_sint_t threads) { LIBSAIS_UNBWT_CONTEXT * RESTRICT ctx = (LIBSAIS_UNBWT_CONTEXT *)libsais16_alloc_aligned(sizeof(LIBSAIS_UNBWT_CONTEXT), 64); sa_uint_t * RESTRICT bucket2 = (sa_uint_t *)libsais16_alloc_aligned(ALPHABET_SIZE * sizeof(sa_uint_t), 4096); uint16_t * RESTRICT fastbits = (uint16_t *)libsais16_alloc_aligned((1 + (1 << UNBWT_FASTBITS)) * sizeof(uint16_t), 4096); sa_uint_t * RESTRICT buckets = threads > 1 ? (sa_uint_t *)libsais16_alloc_aligned((size_t)threads * ALPHABET_SIZE * sizeof(sa_uint_t), 4096) : NULL; if (ctx != NULL && bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1)) { ctx->bucket2 = bucket2; ctx->fastbits = fastbits; ctx->buckets = buckets; ctx->threads = threads; return ctx; } libsais16_free_aligned(buckets); libsais16_free_aligned(fastbits); libsais16_free_aligned(bucket2); libsais16_free_aligned(ctx); return NULL; } static void libsais16_unbwt_free_ctx_main(LIBSAIS_UNBWT_CONTEXT * ctx) { if (ctx != NULL) { libsais16_free_aligned(ctx->buckets); libsais16_free_aligned(ctx->fastbits); libsais16_free_aligned(ctx->bucket2); libsais16_free_aligned(ctx); } } static void libsais16_unbwt_compute_histogram(const uint16_t * RESTRICT T, fast_sint_t n, sa_uint_t * RESTRICT count) { fast_sint_t i; for (i = 0; i < n; i += 1) { count[T[i]]++; } } static void libsais16_unbwt_calculate_fastbits(sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift) { fast_uint_t v, w, sum; for (v = 0, sum = 1, w = 0; w < ALPHABET_SIZE; ++w) { fast_uint_t prev = sum; sum += bucket2[w]; bucket2[w] = (sa_uint_t)prev; if (prev != sum) { for (; v <= ((sum - 1) >> shift); ++v) { fastbits[v] = 
(uint16_t)w; } } } }

/*
 * Scatters positions of one block into P, using bucket2 as the per-symbol
 * write cursor (each store post-increments the cursor of the symbol read).
 *
 * The block is processed in two parts split at 'index': positions before
 * 'index' read T[i] directly, positions after it read the element one slot
 * earlier (T is moved back by one before the second loop).
 */
static void libsais16_unbwt_calculate_P(const uint16_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, fast_uint_t index, fast_sint_t omp_block_start, fast_sint_t omp_block_end)
{
    {
        /* i runs over [omp_block_start, min(index, omp_block_end)). */
        fast_sint_t i = omp_block_start, j = (fast_sint_t)index; if (omp_block_end < j) { j = omp_block_end; }
        for (; i < j; ++i) { fast_uint_t c = T[i]; P[bucket2[c]++] = (sa_uint_t)i; }
    }

    {
        /* i runs over (max(index, omp_block_start), omp_block_end]; T[i] here is the original T[i - 1]. */
        fast_sint_t i = (fast_sint_t)index, j = omp_block_end; if (omp_block_start > i) { i = omp_block_start; }
        for (T -= 1, i += 1; i <= j; ++i) { fast_uint_t c = T[i]; P[bucket2[c]++] = (sa_uint_t)i; }
    }
}

/*
 * Single-threaded preparation of the inverse-BWT tables.
 *
 * bucket2 is filled either from the caller's freq table or from a histogram
 * of T, then handed to libsais16_unbwt_calculate_fastbits and
 * libsais16_unbwt_calculate_P (over the whole range 0..n) with index = I[0].
 */
static void libsais16_unbwt_init_single(const uint16_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits)
{
    fast_uint_t index = I[0];

    /* Smallest shift for which (n >> shift) does not exceed 1 << UNBWT_FASTBITS. */
    fast_uint_t shift = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; }

    if (freq != NULL)
    {
        memcpy(bucket2, freq, ALPHABET_SIZE * sizeof(sa_uint_t));
    }
    else
    {
        memset(bucket2, 0, ALPHABET_SIZE * sizeof(sa_uint_t));
        libsais16_unbwt_compute_histogram(T, n, bucket2);
    }

    libsais16_unbwt_calculate_fastbits(bucket2, fastbits, shift);
    libsais16_unbwt_calculate_P(T, P, bucket2, index, 0, n);
}

#if defined(LIBSAIS_OPENMP)

/*
 * Multi-threaded preparation of the inverse-BWT tables. The parallel region
 * is only active when threads > 1 and n >= 65536; a team of one thread falls
 * back to libsais16_unbwt_init_single.
 * (The definition continues on the following source lines.)
 */
static void libsais16_unbwt_init_parallel(const uint16_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_uint_t * RESTRICT buckets, sa_sint_t threads)
{
    fast_uint_t index = I[0];
    fast_uint_t shift = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; }

    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
    {
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();

        if (omp_num_threads == 1)
        {
            libsais16_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits);
        }
        else
        {
            {
                sa_uint_t * RESTRICT bucket2_local
= buckets + omp_thread_num * ALPHABET_SIZE;

                /* Phase 1: every thread histograms its own slice of T into its private table.
                   All slices but the last have a length that is a multiple of 16. */
                fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
                fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
                fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

                memset(bucket2_local, 0, ALPHABET_SIZE * sizeof(sa_uint_t));
                libsais16_unbwt_compute_histogram(T + omp_block_start, omp_block_size, bucket2_local);
            }

            #pragma omp barrier

            {
                /* Phase 2: every thread owns a slice of the alphabet. For each symbol in the slice the
                   per-thread counts are summed into bucket2, and each per-thread entry is replaced by
                   the sum of the counts of the threads before it. */
                sa_uint_t * RESTRICT bucket2_temp = buckets;
                fast_sint_t omp_block_stride = (ALPHABET_SIZE / omp_num_threads) & (-16);
                fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
                fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ALPHABET_SIZE - omp_block_start;

                memset(bucket2 + omp_block_start, 0, (size_t)omp_block_size * sizeof(sa_uint_t));

                fast_sint_t t;
                for (t = 0; t < omp_num_threads; ++t, bucket2_temp += ALPHABET_SIZE)
                {
                    fast_sint_t c; for (c = omp_block_start; c < omp_block_start + omp_block_size; c += 1) { sa_uint_t A = bucket2[c], B = bucket2_temp[c]; bucket2[c] = A + B; bucket2_temp[c] = A; }
                }
            }

            #pragma omp barrier

            /* Only the master thread builds the fastbits table from the combined counts. */
            #pragma omp master
            {
                libsais16_unbwt_calculate_fastbits(bucket2, fastbits, shift);
            }

            #pragma omp barrier

            {
                /* Phase 3: same slicing of T as in phase 1 (the statement continues on the following source line). */
                sa_uint_t * RESTRICT bucket2_local = buckets + omp_thread_num * ALPHABET_SIZE;
                fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
                fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
                fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : n - omp_block_start;

                /* Add the shared bucket2 value of every symbol to this thread's table, which then serves as
                   the write cursor passed to libsais16_unbwt_calculate_P for this thread's slice. */
                fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_uint_t A = bucket2[c], B = bucket2_local[c]; bucket2_local[c] = A + B; }

                libsais16_unbwt_calculate_P(T, P, bucket2_local, index, omp_block_start, omp_block_start + omp_block_size);
            }

            #pragma omp barrier

            /* The table of the last thread is copied back into bucket2 by the master thread. */
            #pragma omp master
            {
                memcpy(bucket2, buckets + (omp_num_threads - 1) * ALPHABET_SIZE, ALPHABET_SIZE * sizeof(sa_uint_t));
            }
        }
    }
}

#endif

/*
 * Decodes k symbols of one stream into U, starting at position *i0.
 *
 * Per symbol: fastbits[p >> shift] gives a first candidate, which is advanced
 * while bucket2[c] <= p; the symbol is stored and the position moves to P[p].
 * The final position is written back to *i0.
 */
static void libsais16_unbwt_decode_1(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t * i0, fast_uint_t k)
{
    uint16_t * RESTRICT U0 = U;

    fast_uint_t i, p0 = *i0;

    for (i = 0; i != k; ++i)
    {
        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
    }

    *i0 = p0;
}

/* Same as libsais16_unbwt_decode_1, but advances two streams in lockstep; stream j writes to U + j * r. */
static void libsais16_unbwt_decode_2(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t k)
{
    uint16_t * RESTRICT U0 = U;
    uint16_t * RESTRICT U1 = U0 + r;

    fast_uint_t i, p0 = *i0, p1 = *i1;

    for (i = 0; i != k; ++i)
    {
        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
    }

    *i0 = p0; *i1 = p1;
}

/* Three-stream variant of the decoder (the definition continues on the following source line). */
static void libsais16_unbwt_decode_3(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t k)
{
    uint16_t * RESTRICT U0 = U;
    uint16_t * RESTRICT U1 = U0 + r;
    uint16_t * RESTRICT U2 = U1 + r;

    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2;

    for (i = 0; i != k; ++i)
    {
        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while
(bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2;
    }

    *i0 = p0; *i1 = p1; *i2 = p2;
}

/*
 * Decodes k symbols of four streams in lockstep; stream j starts at *ij and
 * writes to U + j * r. Per symbol and stream: take the candidate from
 * fastbits[p >> shift], advance it while bucket2[c] <= p, store it and move
 * to P[p]. The final positions are written back through i0..i3.
 */
static void libsais16_unbwt_decode_4(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t k)
{
    uint16_t * RESTRICT U0 = U;
    uint16_t * RESTRICT U1 = U0 + r;
    uint16_t * RESTRICT U2 = U1 + r;
    uint16_t * RESTRICT U3 = U2 + r;

    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3;

    for (i = 0; i != k; ++i)
    {
        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2;
        uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3;
    }

    *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3;
}

/* Five-stream variant of the decoder (the definition continues on the following source line). */
static void libsais16_unbwt_decode_5(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t k)
{
    uint16_t * RESTRICT U0 = U;
    uint16_t * RESTRICT U1 = U0 + r;
    uint16_t * RESTRICT U2 = U1 + r;
    uint16_t * RESTRICT U3 = U2 + r;
    uint16_t * RESTRICT U4 = U3 + r;

    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4;

    for (i = 0; i != k; ++i)
    {
        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0]
<= p0); } p0 = P[p0]; U0[i] = c0;
        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2;
        uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3;
        uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = c4;
    }

    *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4;
}

/*
 * Decodes k symbols of six streams in lockstep; stream j starts at *ij and
 * writes to U + j * r. Each stream performs the same lookup / advance / store
 * / follow-P step per symbol. The write-back of the final positions continues
 * on the following source line.
 */
static void libsais16_unbwt_decode_6(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t k)
{
    uint16_t * RESTRICT U0 = U;
    uint16_t * RESTRICT U1 = U0 + r;
    uint16_t * RESTRICT U2 = U1 + r;
    uint16_t * RESTRICT U3 = U2 + r;
    uint16_t * RESTRICT U4 = U3 + r;
    uint16_t * RESTRICT U5 = U4 + r;

    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5;

    for (i = 0; i != k; ++i)
    {
        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2;
        uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3;
        uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = c4;
        uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = c5;
    }

    *i0 = p0; *i1 = p1; *i2 = p2;
*i3 = p3; *i4 = p4; *i5 = p5;
}

/*
 * Decodes k symbols of seven streams in lockstep; stream j starts at *ij and
 * writes to U + j * r. Per symbol and stream: take the candidate from
 * fastbits[p >> shift], advance it while bucket2[c] <= p, store it and move
 * to P[p]. The final positions are written back through i0..i6.
 */
static void libsais16_unbwt_decode_7(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6, fast_uint_t k)
{
    uint16_t * RESTRICT U0 = U;
    uint16_t * RESTRICT U1 = U0 + r;
    uint16_t * RESTRICT U2 = U1 + r;
    uint16_t * RESTRICT U3 = U2 + r;
    uint16_t * RESTRICT U4 = U3 + r;
    uint16_t * RESTRICT U5 = U4 + r;
    uint16_t * RESTRICT U6 = U5 + r;

    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6;

    for (i = 0; i != k; ++i)
    {
        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2;
        uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3;
        uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = c4;
        uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = c5;
        uint16_t c6 = fastbits[p6 >> shift]; if (bucket2[c6] <= p6) { do { c6++; } while (bucket2[c6] <= p6); } p6 = P[p6]; U6[i] = c6;
    }

    *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; *i6 = p6;
}

/* Eight-stream variant of the decoder (the body continues on the following source line). */
static void libsais16_unbwt_decode_8(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6, fast_uint_t * i7, fast_uint_t k)
{
uint16_t * RESTRICT U0 = U;
    uint16_t * RESTRICT U1 = U0 + r;
    uint16_t * RESTRICT U2 = U1 + r;
    uint16_t * RESTRICT U3 = U2 + r;
    uint16_t * RESTRICT U4 = U3 + r;
    uint16_t * RESTRICT U5 = U4 + r;
    uint16_t * RESTRICT U6 = U5 + r;
    uint16_t * RESTRICT U7 = U6 + r;

    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6, p7 = *i7;

    /* Eight independent streams advance one symbol each per iteration. */
    for (i = 0; i != k; ++i)
    {
        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2;
        uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3;
        uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = c4;
        uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = c5;
        uint16_t c6 = fastbits[p6 >> shift]; if (bucket2[c6] <= p6) { do { c6++; } while (bucket2[c6] <= p6); } p6 = P[p6]; U6[i] = c6;
        uint16_t c7 = fastbits[p7 >> shift]; if (bucket2[c7] <= p7) { do { c7++; } while (bucket2[c7] <= p7); } p7 = P[p7]; U7[i] = c7;
    }

    *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; *i6 = p6; *i7 = p7;
}

/*
 * Decodes 'blocks' consecutive blocks of r symbols each into U, where block b
 * starts from position I[b]; the last block only holds 'remainder' symbols.
 * While more than eight blocks remain, eight full blocks are decoded at a
 * time. (The definition continues on the following source lines.)
 */
static void libsais16_unbwt_decode(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_sint_t blocks, fast_uint_t remainder)
{
    /* Smallest shift for which (n >> shift) does not exceed 1 << UNBWT_FASTBITS. */
    fast_uint_t shift = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; }
    fast_uint_t offset = 0;

    while (blocks > 8)
    {
        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7];
        libsais16_unbwt_decode_8(U +
offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, &i7, (fast_uint_t)r);
        I += 8; blocks -= 8; offset += 8 * (fast_uint_t)r;
    }

    /* Final group of 1..8 blocks: first 'remainder' symbols are decoded for all of them in lockstep,
       then the remaining r - remainder symbols for all but the last (shorter) block. */
    if (blocks == 1)
    {
        fast_uint_t i0 = I[0];
        libsais16_unbwt_decode_1(U + offset, P, bucket2, fastbits, shift, &i0, remainder);
    }
    else if (blocks == 2)
    {
        fast_uint_t i0 = I[0], i1 = I[1];
        libsais16_unbwt_decode_2(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, remainder);
        libsais16_unbwt_decode_1(U + offset + remainder, P, bucket2, fastbits, shift, &i0, ((fast_uint_t)r) - remainder);
    }
    else if (blocks == 3)
    {
        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2];
        libsais16_unbwt_decode_3(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, remainder);
        libsais16_unbwt_decode_2(U + offset + remainder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, ((fast_uint_t)r) - remainder);
    }
    else if (blocks == 4)
    {
        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3];
        libsais16_unbwt_decode_4(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, remainder);
        libsais16_unbwt_decode_3(U + offset + remainder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, ((fast_uint_t)r) - remainder);
    }
    else if (blocks == 5)
    {
        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4];
        libsais16_unbwt_decode_5(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, remainder);
        libsais16_unbwt_decode_4(U + offset + remainder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, ((fast_uint_t)r) - remainder);
    }
    else if (blocks == 6)
    {
        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5];
        libsais16_unbwt_decode_6(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, remainder);
        libsais16_unbwt_decode_5(U + offset + remainder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, ((fast_uint_t)r) - remainder);
    }
    else if (blocks == 7)
    {
        fast_uint_t i0 =
I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6];
        libsais16_unbwt_decode_7(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, remainder);
        libsais16_unbwt_decode_6(U + offset + remainder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, ((fast_uint_t)r) - remainder);
    }
    else
    {
        /* Any other block count reaching this point is handled as eight blocks. */
        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7];
        libsais16_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, &i7, remainder);
        libsais16_unbwt_decode_7(U + offset + remainder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, ((fast_uint_t)r) - remainder);
    }
}

/*
 * Splits the output into blocks of r symbols and distributes them over the
 * threads; each thread decodes a contiguous run of blocks with
 * libsais16_unbwt_decode. The parallel region is only active when more than
 * one thread is usable and n >= 65536.
 * (The definition continues on the following source line.)
 */
static void libsais16_unbwt_decode_omp(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_sint_t threads)
{
    /* Number of blocks (rounded up) and the length of the last, possibly shorter, block. */
    fast_sint_t blocks = 1 + (((fast_sint_t)n - 1) / (fast_sint_t)r);
    fast_uint_t remainder = (fast_uint_t)n - ((fast_uint_t)r * ((fast_uint_t)blocks - 1));

#if defined(LIBSAIS_OPENMP)
    /* Never start more threads than there are blocks. */
    fast_sint_t max_threads = blocks < threads ? blocks : threads;
    #pragma omp parallel num_threads(max_threads) if(max_threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif

        /* The first (blocks % omp_num_threads) threads receive one extra block. */
        fast_sint_t omp_block_stride = blocks / omp_num_threads;
        fast_sint_t omp_block_remainder = blocks % omp_num_threads;
        fast_sint_t omp_block_size = omp_block_stride + (omp_thread_num < omp_block_remainder);
        fast_sint_t omp_block_start = omp_block_stride * omp_thread_num + (omp_thread_num < omp_block_remainder ?
omp_thread_num : omp_block_remainder);

        /* Only the last thread owns the final block, so only it is given the short 'remainder' length. */
        libsais16_unbwt_decode(U + r * omp_block_start, P, n, r, I + omp_block_start, bucket2, fastbits, omp_block_size, omp_thread_num < omp_num_threads - 1 ? (fast_uint_t)r : remainder);
    }
}

/*
 * Runs the inverse BWT with caller-provided work tables: a table-initialisation
 * pass followed by libsais16_unbwt_decode_omp. Always returns 0.
 *
 * With OpenMP enabled, the parallel initialisation is chosen when threads > 1
 * and n >= 262144; otherwise (and always without OpenMP) the single-threaded
 * initialisation runs.
 */
static sa_sint_t libsais16_unbwt_core(const uint16_t * RESTRICT T, uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_uint_t * RESTRICT buckets, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    if (threads > 1 && n >= 262144)
    {
        libsais16_unbwt_init_parallel(T, P, n, freq, I, bucket2, fastbits, buckets, threads);
    }
    else
#else
    UNUSED(buckets);
#endif
    {
        /* Without OpenMP this brace block is unconditional; with OpenMP it is the else-branch above. */
        libsais16_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits);
    }

    libsais16_unbwt_decode_omp(U, P, n, r, I, bucket2, fastbits, threads);
    return 0;
}

/*
 * Allocates the work tables for one inverse-BWT run, calls libsais16_unbwt_core
 * and releases the tables again. fastbits is sized as 1 + (n >> shift) entries;
 * the per-thread 'buckets' table is only allocated when threads > 1 and
 * n >= 262144. (The definition continues on the following source line.)
 */
static sa_sint_t libsais16_unbwt_main(const uint16_t * T, uint16_t * U, sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I, sa_sint_t threads)
{
    fast_uint_t shift = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; }

    sa_uint_t * RESTRICT bucket2 = (sa_uint_t *)libsais16_alloc_aligned(ALPHABET_SIZE * sizeof(sa_uint_t), 4096);
    uint16_t * RESTRICT fastbits = (uint16_t *)libsais16_alloc_aligned(((size_t)1 + (size_t)(n >> shift)) * sizeof(uint16_t), 4096);
    sa_uint_t * RESTRICT buckets = threads > 1 && n >= 262144 ? (sa_uint_t *)libsais16_alloc_aligned((size_t)threads * ALPHABET_SIZE * sizeof(sa_uint_t), 4096) : NULL;

    sa_sint_t index = bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1 || n < 262144) ?
libsais16_unbwt_core(T, U, P, n, freq, r, I, bucket2, fastbits, buckets, threads) : -2; libsais16_free_aligned(buckets); libsais16_free_aligned(fastbits); libsais16_free_aligned(bucket2); return index; } static sa_sint_t libsais16_unbwt_main_ctx(const LIBSAIS_UNBWT_CONTEXT * ctx, const uint16_t * T, uint16_t * U, sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I) { return ctx != NULL && ctx->bucket2 != NULL && ctx->fastbits != NULL && (ctx->buckets != NULL || ctx->threads == 1) ? libsais16_unbwt_core(T, U, P, n, freq, r, I, ctx->bucket2, ctx->fastbits, ctx->buckets, (sa_sint_t)ctx->threads) : -2; } void * libsais16_unbwt_create_ctx(void) { return (void *)libsais16_unbwt_create_ctx_main(1); } void libsais16_unbwt_free_ctx(void * ctx) { libsais16_unbwt_free_ctx_main((LIBSAIS_UNBWT_CONTEXT *)ctx); } int32_t libsais16_unbwt(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i) { return libsais16_unbwt_aux(T, U, A, n, freq, n, &i); } int32_t libsais16_unbwt_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i) { return libsais16_unbwt_aux_ctx(ctx, T, U, A, n, freq, n, &i); } int32_t libsais16_unbwt_aux(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL)) { return -1; } else if (n <= 1) { if (I[0] != n) { return -1; } if (n == 1) { U[0] = T[0]; } return 0; } fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } } return libsais16_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, 1); } int32_t libsais16_unbwt_aux_ctx(const void * ctx, const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r 
!= n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL)) { return -1; } else if (n <= 1) { if (I[0] != n) { return -1; } if (n == 1) { U[0] = T[0]; } return 0; } fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } } return libsais16_unbwt_main_ctx((const LIBSAIS_UNBWT_CONTEXT *)ctx, T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I); } #if defined(LIBSAIS_OPENMP) void * libsais16_unbwt_create_ctx_omp(int32_t threads) { if (threads < 0) { return NULL; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? threads : 1; return (void *)libsais16_unbwt_create_ctx_main(threads); } int32_t libsais16_unbwt_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t i, int32_t threads) { return libsais16_unbwt_aux_omp(T, U, A, n, freq, n, &i, threads); } int32_t libsais16_unbwt_aux_omp(const uint16_t * T, uint16_t * U, int32_t * A, int32_t n, const int32_t * freq, int32_t r, const int32_t * I, int32_t threads) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL) || (threads < 0)) { return -1; } else if (n <= 1) { if (I[0] != n) { return -1; } if (n == 1) { U[0] = T[0]; } return 0; } fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? threads : 1; return libsais16_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, threads); } #endif static void libsais16_compute_phi(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP, sa_sint_t n, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; sa_sint_t k = omp_block_start > 0 ? 
SA[omp_block_start - 1] : n;

    /* Main loop, four entries per iteration: PLCP[SA[i]] receives the previous suffix array entry
       (k), with prefetches issued prefetch_distance entries ahead. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
    {
        libsais16_prefetchr(&SA[i + 2 * prefetch_distance]);

        libsais16_prefetchw(&PLCP[SA[i + prefetch_distance + 0]]);
        libsais16_prefetchw(&PLCP[SA[i + prefetch_distance + 1]]);

        PLCP[SA[i + 0]] = k; k = SA[i + 0];
        PLCP[SA[i + 1]] = k; k = SA[i + 1];

        libsais16_prefetchw(&PLCP[SA[i + prefetch_distance + 2]]);
        libsais16_prefetchw(&PLCP[SA[i + prefetch_distance + 3]]);

        PLCP[SA[i + 2]] = k; k = SA[i + 2];
        PLCP[SA[i + 3]] = k; k = SA[i + 3];
    }

    /* Tail of the block, one entry at a time and without prefetching. */
    for (j += prefetch_distance + 3; i < j; i += 1)
    {
        PLCP[SA[i]] = k; k = SA[i];
    }
}

/*
 * Runs libsais16_compute_phi over the whole array, split into one block per
 * thread (every block but the last has a length that is a multiple of 16).
 * The parallel region is only active when threads > 1 and n >= 65536.
 */
static void libsais16_compute_phi_omp(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP, sa_sint_t n, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

        libsais16_compute_phi(SA, PLCP, n, omp_block_start, omp_block_size);
    }
}

/*
 * For every i in the block, replaces PLCP[i] (read as a text position k) by
 * the length of the common prefix of T starting at i and at k, capped at
 * n - max(i, k). The match length is carried over to the next i, reduced by
 * one when non-zero. (The definition continues on the following source line.)
 */
static void libsais16_compute_plcp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, fast_sint_t n, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j, l = 0;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance; i < j; i += 1)
    {
        libsais16_prefetchw(&PLCP[i + 2 * prefetch_distance]);
        libsais16_prefetchr(&T[PLCP[i + prefetch_distance] + l]);

        fast_sint_t k = PLCP[i], m = n - (i > k ?
i : k);
        while (l < m && T[i + l] == T[k + l]) { l++; }

        PLCP[i] = (sa_sint_t)l; l -= (l != 0);
    }

    /* Last prefetch_distance entries of the block: same computation without prefetching. */
    for (j += prefetch_distance; i < j; i += 1)
    {
        fast_sint_t k = PLCP[i], m = n - (i > k ? i : k);
        while (l < m && T[i + l] == T[k + l]) { l++; }

        PLCP[i] = (sa_sint_t)l; l -= (l != 0);
    }
}

/*
 * Runs libsais16_compute_plcp over the whole array, split into one block per
 * thread (every block but the last has a length that is a multiple of 16).
 * The parallel region is only active when threads > 1 and n >= 65536.
 */
static void libsais16_compute_plcp_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, sa_sint_t n, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

        libsais16_compute_plcp(T, PLCP, n, omp_block_start, omp_block_size);
    }
}

/*
 * Variant of libsais16_compute_plcp without a length cap: matching stops at
 * the first mismatch or as soon as T[i + l] is 0, so a 0 symbol is never
 * counted as part of a common prefix.
 */
static void libsais16_compute_plcp_gsa(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j, l = 0;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance; i < j; i += 1)
    {
        libsais16_prefetchw(&PLCP[i + 2 * prefetch_distance]);
        libsais16_prefetchr(&T[PLCP[i + prefetch_distance] + l]);

        fast_sint_t k = PLCP[i];
        while (T[i + l] > 0 && T[i + l] == T[k + l]) { l++; }

        PLCP[i] = (sa_sint_t)l; l -= (l != 0);
    }

    /* Last prefetch_distance entries of the block: same computation without prefetching. */
    for (j += prefetch_distance; i < j; i += 1)
    {
        fast_sint_t k = PLCP[i];
        while (T[i + l] > 0 && T[i + l] == T[k + l]) { l++; }

        PLCP[i] = (sa_sint_t)l; l -= (l != 0);
    }
}

/*
 * Runs libsais16_compute_plcp_gsa over the whole array, one block per thread.
 * NOTE(review): in this dump the #pragma below is cut by the source line break
 * and its condition continues on the following source line.
 */
static void libsais16_compute_plcp_gsa_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, sa_sint_t n, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 &&
n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Every block but the last has a length that is a multiple of 16. */
        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

        libsais16_compute_plcp_gsa(T, PLCP, omp_block_start, omp_block_size);
    }
}

/*
 * Fills LCP[i] = PLCP[SA[i]] for every i in the block. The main loop handles
 * four entries per iteration and issues prefetches prefetch_distance entries
 * ahead; the tail loop handles the rest one entry at a time.
 */
static void libsais16_compute_lcp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT LCP, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
    {
        libsais16_prefetchr(&SA[i + 2 * prefetch_distance]);
        libsais16_prefetchw(&LCP[i + prefetch_distance]);

        libsais16_prefetchr(&PLCP[SA[i + prefetch_distance + 0]]);
        libsais16_prefetchr(&PLCP[SA[i + prefetch_distance + 1]]);

        LCP[i + 0] = PLCP[SA[i + 0]];
        LCP[i + 1] = PLCP[SA[i + 1]];

        libsais16_prefetchr(&PLCP[SA[i + prefetch_distance + 2]]);
        libsais16_prefetchr(&PLCP[SA[i + prefetch_distance + 3]]);

        LCP[i + 2] = PLCP[SA[i + 2]];
        LCP[i + 3] = PLCP[SA[i + 3]];
    }

    for (j += prefetch_distance + 3; i < j; i += 1)
    {
        LCP[i] = PLCP[SA[i]];
    }
}

/*
 * Runs libsais16_compute_lcp over the whole array, one block per thread.
 * The parallel region is only active when threads > 1 and n >= 65536.
 * (The definition continues on the following source line.)
 */
static void libsais16_compute_lcp_omp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT LCP, sa_sint_t n, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        fast_sint_t omp_block_stride = (n / omp_num_threads) &
(-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

        libsais16_compute_lcp(PLCP, SA, LCP, omp_block_start, omp_block_size);
    }
}

/*
 * Builds the PLCP array for the 16-bit text T of length n from its suffix array SA.
 *
 * Returns -1 when T, SA or PLCP is NULL or n is negative, and 0 otherwise.
 * For n == 1 the single entry PLCP[0] is set to 0; for n == 0 nothing is written.
 * Larger inputs are processed with a thread count of 1: PLCP is first filled by
 * libsais16_compute_phi_omp (defined outside this section) and then converted
 * in place by libsais16_compute_plcp_omp.
 */
int32_t libsais16_plcp(const uint16_t * T, const int32_t * SA, int32_t * PLCP, int32_t n)
{
    if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0))
    {
        return -1;
    }
    else if (n <= 1)
    {
        if (n == 1) { PLCP[0] = 0; }
        return 0;
    }

    libsais16_compute_phi_omp(SA, PLCP, n, 1);
    libsais16_compute_plcp_omp(T, PLCP, n, 1);

    return 0;
}

/*
 * Same as libsais16_plcp, but uses the zero-delimited comparison
 * (libsais16_compute_plcp_gsa_omp). In addition to the usual argument checks
 * it returns -1 when n > 0 and the last symbol T[n - 1] is not 0.
 */
int32_t libsais16_plcp_gsa(const uint16_t * T, const int32_t * SA, int32_t * PLCP, int32_t n)
{
    if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0) || (n > 0 && T[n - 1] != 0))
    {
        return -1;
    }
    else if (n <= 1)
    {
        if (n == 1) { PLCP[0] = 0; }
        return 0;
    }

    libsais16_compute_phi_omp(SA, PLCP, n, 1);
    libsais16_compute_plcp_gsa_omp(T, PLCP, n, 1);

    return 0;
}

/*
 * Builds LCP from PLCP and SA so that LCP[i] = PLCP[SA[i]] for all i in [0, n).
 *
 * Returns -1 when PLCP, SA or LCP is NULL or n is negative, and 0 otherwise.
 * n == 1 is handled inline; larger inputs go through libsais16_compute_lcp_omp
 * with a thread count of 1.
 */
int32_t libsais16_lcp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP, int32_t n)
{
    if ((PLCP == NULL) || (SA == NULL) || (LCP == NULL) || (n < 0))
    {
        return -1;
    }
    else if (n <= 1)
    {
        if (n == 1) { LCP[0] = PLCP[SA[0]]; }
        return 0;
    }

    libsais16_compute_lcp_omp(PLCP, SA, LCP, n, 1);

    return 0;
}

#if defined(LIBSAIS_OPENMP)

/*
 * Multi-threaded counterpart of libsais16_plcp (only built with LIBSAIS_OPENMP).
 *
 * A negative 'threads' is rejected with -1. threads == 0 selects
 * omp_get_max_threads() / omp_get_num_threads(), clamped to at least 1.
 */
int32_t libsais16_plcp_omp(const uint16_t * T, const int32_t * SA, int32_t * PLCP, int32_t n, int32_t threads)
{
    if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0) || (threads < 0))
    {
        return -1;
    }
    else if (n <= 1)
    {
        if (n == 1) { PLCP[0] = 0; }
        return 0;
    }

    threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads());
    threads = threads > 0 ?
threads : 1;

    libsais16_compute_phi_omp(SA, PLCP, n, threads);
    libsais16_compute_plcp_omp(T, PLCP, n, threads);

    return 0;
}

/*
 * Multi-threaded counterpart of libsais16_plcp_gsa.
 *
 * Returns -1 when a pointer is NULL, n or threads is negative, or n > 0 and
 * T[n - 1] is not 0. threads == 0 selects
 * omp_get_max_threads() / omp_get_num_threads(), clamped to at least 1.
 */
int32_t libsais16_plcp_gsa_omp(const uint16_t * T, const int32_t * SA, int32_t * PLCP, int32_t n, int32_t threads)
{
    if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0) || (n > 0 && T[n - 1] != 0) || (threads < 0))
    {
        return -1;
    }
    else if (n <= 1)
    {
        if (n == 1) { PLCP[0] = 0; }
        return 0;
    }

    threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads());
    threads = threads > 0 ? threads : 1;

    libsais16_compute_phi_omp(SA, PLCP, n, threads);
    libsais16_compute_plcp_gsa_omp(T, PLCP, n, threads);

    return 0;
}

/*
 * Multi-threaded counterpart of libsais16_lcp: LCP[i] = PLCP[SA[i]] for i in [0, n).
 *
 * Returns -1 when a pointer is NULL or n or threads is negative, 0 otherwise.
 * threads == 0 selects omp_get_max_threads() / omp_get_num_threads(), clamped
 * to at least 1.
 */
int32_t libsais16_lcp_omp(const int32_t * PLCP, const int32_t * SA, int32_t * LCP, int32_t n, int32_t threads)
{
    if ((PLCP == NULL) || (SA == NULL) || (LCP == NULL) || (n < 0) || (threads < 0))
    {
        return -1;
    }
    else if (n <= 1)
    {
        if (n == 1) { LCP[0] = PLCP[SA[0]]; }
        return 0;
    }

    threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads());
    threads = threads > 0 ? threads : 1;

    libsais16_compute_lcp_omp(PLCP, SA, LCP, n, threads);

    return 0;
}

#endif

================================================
FILE: src/libsais16x64.c
================================================
/*--

This file is a part of libsais, a library for linear time suffix array,
longest common prefix array and burrows wheeler transform construction.

   Copyright (c) 2021-2025 Ilya Grebnov

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Please see the file LICENSE for full copyright information. --*/ #include "libsais16.h" #include "libsais16x64.h" #include #include #include #include #include #if defined(LIBSAIS_OPENMP) #include #else #define UNUSED(_x) (void)(_x) #endif typedef int64_t sa_sint_t; typedef uint64_t sa_uint_t; typedef int64_t fast_sint_t; typedef uint64_t fast_uint_t; #define SAINT_BIT (64) #define SAINT_MAX INT64_MAX #define SAINT_MIN INT64_MIN #define ALPHABET_SIZE (1 << CHAR_BIT << CHAR_BIT) #define UNBWT_FASTBITS (17) #define SUFFIX_GROUP_BIT (SAINT_BIT - 1) #define SUFFIX_GROUP_MARKER (((sa_sint_t)1) << (SUFFIX_GROUP_BIT - 1)) #define BUCKETS_INDEX2(_c, _s) ((((fast_sint_t)_c) << 1) + (fast_sint_t)(_s)) #define BUCKETS_INDEX4(_c, _s) ((((fast_sint_t)_c) << 2) + (fast_sint_t)(_s)) #define LIBSAIS_LOCAL_BUFFER_SIZE (1000) #define LIBSAIS_PER_THREAD_CACHE_SIZE (2097184) #define LIBSAIS_FLAGS_NONE (0) #define LIBSAIS_FLAGS_BWT (1) #define LIBSAIS_FLAGS_GSA (2) typedef struct LIBSAIS_THREAD_CACHE { sa_sint_t symbol; sa_sint_t index; } LIBSAIS_THREAD_CACHE; typedef union LIBSAIS_THREAD_STATE { struct { fast_sint_t position; fast_sint_t count; fast_sint_t m; fast_sint_t last_lms_suffix; sa_sint_t * buckets; LIBSAIS_THREAD_CACHE * cache; } state; uint8_t padding[64]; } LIBSAIS_THREAD_STATE; typedef struct LIBSAIS_CONTEXT { sa_sint_t * buckets; LIBSAIS_THREAD_STATE * thread_state; fast_sint_t threads; } LIBSAIS_CONTEXT; typedef struct LIBSAIS_UNBWT_CONTEXT { sa_uint_t * bucket2; uint16_t * fastbits; sa_uint_t * buckets; fast_sint_t threads; } LIBSAIS_UNBWT_CONTEXT; #if defined(__GNUC__) || defined(__clang__) #define RESTRICT __restrict__ #elif defined(_MSC_VER) || defined(__INTEL_COMPILER) #define RESTRICT __restrict #else #error Your compiler, configuration or platform is not supported. 
#endif #if defined(__has_builtin) #if __has_builtin(__builtin_prefetch) #define HAS_BUILTIN_PREFETCH #endif #elif defined(__GNUC__) && (((__GNUC__ == 3) && (__GNUC_MINOR__ >= 2)) || (__GNUC__ >= 4)) #define HAS_BUILTIN_PREFETCH #endif #if defined(HAS_BUILTIN_PREFETCH) #define libsais16x64_prefetchr(address) __builtin_prefetch((const void *)(address), 0, 3) #define libsais16x64_prefetchw(address) __builtin_prefetch((const void *)(address), 1, 3) #elif defined (_M_IX86) || defined (_M_AMD64) #include #define libsais16x64_prefetchr(address) _mm_prefetch((const void *)(address), _MM_HINT_T0) #define libsais16x64_prefetchw(address) _m_prefetchw((const void *)(address)) #elif defined (_M_ARM) #include #define libsais16x64_prefetchr(address) __prefetch((const void *)(address)) #define libsais16x64_prefetchw(address) __prefetchw((const void *)(address)) #elif defined (_M_ARM64) #include #define libsais16x64_prefetchr(address) __prefetch2((const void *)(address), 0) #define libsais16x64_prefetchw(address) __prefetch2((const void *)(address), 16) #else #error Your compiler, configuration or platform is not supported. 
#endif

/* Define __LITTLE_ENDIAN__ or __BIG_ENDIAN__ from whichever byte-order macros the
   platform provides, unless one is already defined; _WIN32 falls back to little endian. */
#if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__)
    #if defined(_LITTLE_ENDIAN) \
            || (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && BYTE_ORDER == LITTLE_ENDIAN) \
            || (defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && _BYTE_ORDER == _LITTLE_ENDIAN) \
            || (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) \
            || (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
        #define __LITTLE_ENDIAN__
    #elif defined(_BIG_ENDIAN) \
            || (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN) \
            || (defined(_BYTE_ORDER) && defined(_BIG_ENDIAN) && _BYTE_ORDER == _BIG_ENDIAN) \
            || (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN) \
            || (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
        #define __BIG_ENDIAN__
    #elif defined(_WIN32)
        #define __LITTLE_ENDIAN__
    #endif
#endif

/* Rounds 'address' up to the next multiple of 'alignment' using a mask, so
   'alignment' is expected to be a power of two. */
static void * libsais16x64_align_up(const void * address, size_t alignment)
{
    return (void *)((((ptrdiff_t)address) + ((ptrdiff_t)alignment) - 1) & (-((ptrdiff_t)alignment)));
}

/*
 * Allocates 'size' bytes aligned to 'alignment'; returns NULL if malloc fails.
 *
 * The request is over-allocated by sizeof(short) + alignment - 1 bytes. The
 * distance from the malloc'ed pointer to the returned pointer is stored in the
 * short directly in front of the returned address so that
 * libsais16x64_free_aligned can recover the original pointer.
 */
static void * libsais16x64_alloc_aligned(size_t size, size_t alignment)
{
    void * address = malloc(size + sizeof(short) + alignment - 1);
    if (address != NULL)
    {
        void * aligned_address = libsais16x64_align_up((void *)((ptrdiff_t)address + (ptrdiff_t)(sizeof(short))), alignment);
        ((short *)aligned_address)[-1] = (short)((ptrdiff_t)aligned_address - (ptrdiff_t)address);

        return aligned_address;
    }

    return NULL;
}

/* Frees a pointer obtained from libsais16x64_alloc_aligned; NULL is ignored. */
static void libsais16x64_free_aligned(void * aligned_address)
{
    if (aligned_address != NULL)
    {
        /* Step back by the stored offset to reach the pointer malloc returned. */
        free((void *)((ptrdiff_t)aligned_address - ((short *)aligned_address)[-1]));
    }
}

/*
 * Allocates the per-thread state array together with one shared bucket array
 * and one shared cache array (all 4096-byte aligned) and hands each thread its
 * slice of the latter two. Returns NULL if any allocation fails.
 */
static LIBSAIS_THREAD_STATE * libsais16x64_alloc_thread_state(sa_sint_t threads)
{
    LIBSAIS_THREAD_STATE * RESTRICT thread_state = (LIBSAIS_THREAD_STATE *)libsais16x64_alloc_aligned((size_t)threads * sizeof(LIBSAIS_THREAD_STATE), 4096);
    sa_sint_t * RESTRICT thread_buckets =
(sa_sint_t *)libsais16x64_alloc_aligned((size_t)threads * 4 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);
    LIBSAIS_THREAD_CACHE * RESTRICT thread_cache = (LIBSAIS_THREAD_CACHE *)libsais16x64_alloc_aligned((size_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE * sizeof(LIBSAIS_THREAD_CACHE), 4096);

    if (thread_state != NULL && thread_buckets != NULL && thread_cache != NULL)
    {
        /* Thread t gets 4 * ALPHABET_SIZE counters and LIBSAIS_PER_THREAD_CACHE_SIZE cache
           entries; thread 0 therefore holds the base pointers of both allocations. */
        fast_sint_t t;
        for (t = 0; t < threads; ++t)
        {
            thread_state[t].state.buckets = thread_buckets; thread_buckets += 4 * ALPHABET_SIZE;
            thread_state[t].state.cache = thread_cache; thread_cache += LIBSAIS_PER_THREAD_CACHE_SIZE;
        }

        return thread_state;
    }

    /* At least one allocation failed: release whatever succeeded (NULL is ignored). */
    libsais16x64_free_aligned(thread_cache);
    libsais16x64_free_aligned(thread_buckets);
    libsais16x64_free_aligned(thread_state);
    return NULL;
}

/* Releases everything allocated by libsais16x64_alloc_thread_state; NULL is ignored.
   The cache and bucket arrays are freed through thread 0, which holds their base pointers. */
static void libsais16x64_free_thread_state(LIBSAIS_THREAD_STATE * thread_state)
{
    if (thread_state != NULL)
    {
        libsais16x64_free_aligned(thread_state[0].state.cache);
        libsais16x64_free_aligned(thread_state[0].state.buckets);
        libsais16x64_free_aligned(thread_state);
    }
}

#if defined(LIBSAIS_OPENMP)

/* Returns how many entries of SA[omp_block_start .. omp_block_start + omp_block_size) are negative. */
static sa_sint_t libsais16x64_count_negative_marked_suffixes(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    sa_sint_t count = 0;

    fast_sint_t i; for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) { count += (SA[i] < 0); }

    return count;
}

/* Returns how many entries of SA[omp_block_start .. omp_block_start + omp_block_size) are equal to 0. */
static sa_sint_t libsais16x64_count_zero_marked_suffixes(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    sa_sint_t count = 0;

    fast_sint_t i; for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) { count += (SA[i] == 0); }

    return count;
}

/*
 * Applies the cached writes cache[omp_block_start .. omp_block_start + omp_block_size):
 * for each entry, SA[entry.symbol] = entry.index (the 'symbol' field is used as
 * the destination position in SA). Four entries are handled per iteration with
 * prefetching; a trailing loop handles the rest.
 */
static void libsais16x64_place_cached_suffixes(sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
    {
        libsais16x64_prefetchr(&cache[i
+ 2 * prefetch_distance]);

        /* Prefetch the SA destinations of the entries 64 ahead. */
        libsais16x64_prefetchw(&SA[cache[i + prefetch_distance + 0].symbol]);
        libsais16x64_prefetchw(&SA[cache[i + prefetch_distance + 1].symbol]);
        libsais16x64_prefetchw(&SA[cache[i + prefetch_distance + 2].symbol]);
        libsais16x64_prefetchw(&SA[cache[i + prefetch_distance + 3].symbol]);

        SA[cache[i + 0].symbol] = cache[i + 0].index;
        SA[cache[i + 1].symbol] = cache[i + 1].index;
        SA[cache[i + 2].symbol] = cache[i + 2].index;
        SA[cache[i + 3].symbol] = cache[i + 3].index;
    }

    for (j += prefetch_distance + 3; i < j; i += 1)
    {
        SA[cache[i].symbol] = cache[i].index;
    }
}

/*
 * Drops cache entries whose 'symbol' is negative, packing the remaining ones
 * to the front of the block in their original order, then writes the packed
 * entries to SA via libsais16x64_place_cached_suffixes.
 */
static void libsais16x64_compact_and_place_cached_suffixes(sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    /* l is the write cursor: every entry is copied to cache[l], and l advances only
       when the copied entry has symbol >= 0, so rejected entries get overwritten. */
    fast_sint_t i, j, l;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4)
    {
        libsais16x64_prefetchw(&cache[i + prefetch_distance]);

        cache[l] = cache[i + 0]; l += cache[l].symbol >= 0;
        cache[l] = cache[i + 1]; l += cache[l].symbol >= 0;
        cache[l] = cache[i + 2]; l += cache[l].symbol >= 0;
        cache[l] = cache[i + 3]; l += cache[l].symbol >= 0;
    }

    for (j += 3; i < j; i += 1)
    {
        cache[l] = cache[i]; l += cache[l].symbol >= 0;
    }

    /* l - omp_block_start entries survived the compaction. */
    libsais16x64_place_cached_suffixes(SA, cache, omp_block_start, l - omp_block_start);
}

/*
 * libsais16x64_accumulate_counts_s32_K (K = 2..9): element-wise sum of K
 * counter arrays of bucket_size entries each. The arrays sit bucket_stride
 * elements apart at bucket00, bucket00 - bucket_stride, ...; the sum is
 * written to bucket00 and the other arrays are left unchanged.
 */
static void libsais16x64_accumulate_counts_s32_2(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
{
    sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
    fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s]; }
}

/* Three-array version of the element-wise sum into bucket00. */
static void libsais16x64_accumulate_counts_s32_3(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
{
    sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
    sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
    fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] +
bucket01[s] + bucket02[s]; }
}

/* Element-wise sum of four counter arrays spaced bucket_stride apart (bucket00 and
   the three before it); the result is stored in bucket00. */
static void libsais16x64_accumulate_counts_s32_4(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
{
    sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
    sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
    sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
    fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s]; }
}

/* Five-array version of the element-wise sum into bucket00. */
static void libsais16x64_accumulate_counts_s32_5(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
{
    sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
    sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
    sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
    sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride;
    fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s]; }
}

/* Six-array version of the element-wise sum into bucket00. */
static void libsais16x64_accumulate_counts_s32_6(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
{
    sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
    sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
    sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
    sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride;
    sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride;
    fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s]; }
}

/* Seven-array version of the element-wise sum into bucket00. */
static void libsais16x64_accumulate_counts_s32_7(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
{
    sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
    sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
    sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
    sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride;
    sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride;
    sa_sint_t * RESTRICT
bucket06 = bucket05 - bucket_stride;
    fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s]; }
}

/* Element-wise sum of eight counter arrays spaced bucket_stride apart (bucket00 and
   the seven before it); the result is stored in bucket00. */
static void libsais16x64_accumulate_counts_s32_8(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
{
    sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
    sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
    sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
    sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride;
    sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride;
    sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride;
    sa_sint_t * RESTRICT bucket07 = bucket06 - bucket_stride;
    fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s]; }
}

/* Nine-array version of the element-wise sum into bucket00. */
static void libsais16x64_accumulate_counts_s32_9(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride)
{
    sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride;
    sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride;
    sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride;
    sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride;
    sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride;
    sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride;
    sa_sint_t * RESTRICT bucket07 = bucket06 - bucket_stride;
    sa_sint_t * RESTRICT bucket08 = bucket07 - bucket_stride;
    fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s] + bucket08[s]; }
}

/*
 * Sums num_buckets counter arrays (each bucket_size entries, bucket_stride
 * elements apart, the last one at 'buckets') into the array at 'buckets'.
 *
 * While nine or more arrays remain, the nine arrays furthest from 'buckets' are
 * folded into the nearest of those nine, which reduces the number of
 * outstanding arrays by eight. The remaining 2..8 arrays are then summed into
 * 'buckets' by the matching fixed-size helper.
 */
static void libsais16x64_accumulate_counts_s32(sa_sint_t * RESTRICT buckets, fast_sint_t bucket_size, fast_sint_t bucket_stride, fast_sint_t num_buckets)
{
    while (num_buckets >= 9)
    {
        libsais16x64_accumulate_counts_s32_9(buckets - (num_buckets - 9) * bucket_stride,
bucket_size, bucket_stride); num_buckets -= 8;
    }

    /* 1..8 arrays are left; a single array needs no work. */
    switch (num_buckets)
    {
        case 1: break;
        case 2: libsais16x64_accumulate_counts_s32_2(buckets, bucket_size, bucket_stride); break;
        case 3: libsais16x64_accumulate_counts_s32_3(buckets, bucket_size, bucket_stride); break;
        case 4: libsais16x64_accumulate_counts_s32_4(buckets, bucket_size, bucket_stride); break;
        case 5: libsais16x64_accumulate_counts_s32_5(buckets, bucket_size, bucket_stride); break;
        case 6: libsais16x64_accumulate_counts_s32_6(buckets, bucket_size, bucket_stride); break;
        case 7: libsais16x64_accumulate_counts_s32_7(buckets, bucket_size, bucket_stride); break;
        case 8: libsais16x64_accumulate_counts_s32_8(buckets, bucket_size, bucket_stride); break;
        default: break;
    }
}

#endif

/*
 * Processes SA[0 .. l) in per-thread blocks (stride rounded down to a multiple
 * of 16, last thread takes the remainder). Parallel only when built with
 * LIBSAIS_OPENMP, threads > 1 and l >= 65536.
 */
static void libsais16x64_flip_suffix_markers_omp(sa_sint_t * RESTRICT SA, sa_sint_t l, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && l >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        fast_sint_t omp_block_stride = (l / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : l - omp_block_start; fast_sint_t i; for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) { SA[i] ^= SAINT_MIN; } } } static void libsais16x64_gather_lms_suffixes_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, fast_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { if (omp_block_size > 0) { const fast_sint_t prefetch_distance = 256; fast_sint_t i, j = omp_block_start + omp_block_size, c0 = T[omp_block_start + omp_block_size - 1], c1 = -1; while (j < n && (c1 = T[j]) == c0) { ++j; } fast_uint_t f0 = c0 >= c1, f1 = 0; for (i = omp_block_start + omp_block_size - 2, j = omp_block_start + 3; i >= j; i -= 4) { libsais16x64_prefetchr(&T[i - prefetch_distance]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0); c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 0); m -= (f0 & ~f1); c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i - 1); m -= (f1 & ~f0); c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 2); m -= (f0 & ~f1); } for (j -= 3; i >= j; i -= 1) { c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i + 1); m -= (f0 & ~f1); } SA[m] = (sa_sint_t)(i + 1); } } static void libsais16x64_gather_lms_suffixes_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && omp_get_dynamic() == 0) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (n / omp_num_threads) 
& (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; if (omp_num_threads == 1) { libsais16x64_gather_lms_suffixes_16u(T, SA, n, (fast_sint_t)n - 1, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t t, m = 0; for (t = omp_num_threads - 1; t > omp_thread_num; --t) { m += thread_state[t].state.m; } libsais16x64_gather_lms_suffixes_16u(T, SA, n, (fast_sint_t)n - 1 - m, omp_block_start, omp_block_size); #pragma omp barrier if (thread_state[omp_thread_num].state.m > 0) { SA[(fast_sint_t)n - 1 - m] = (sa_sint_t)thread_state[omp_thread_num].state.last_lms_suffix; } } #endif } } static sa_sint_t libsais16x64_gather_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n) { const fast_sint_t prefetch_distance = 64; sa_sint_t i = n - 2; sa_sint_t m = n - 1; fast_uint_t f0 = 1; fast_uint_t f1 = 0; fast_sint_t c0 = T[n - 1]; fast_sint_t c1 = 0; for (; i >= 3; i -= 4) { libsais16x64_prefetchr(&T[i - prefetch_distance]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = i + 1; m -= (sa_sint_t)(f1 & ~f0); c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = i - 0; m -= (sa_sint_t)(f0 & ~f1); c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = i - 1; m -= (sa_sint_t)(f1 & ~f0); c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = i - 2; m -= (sa_sint_t)(f0 & ~f1); } for (; i >= 0; i -= 1) { c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = i + 1; m -= (sa_sint_t)(f0 & ~f1); } return n - 1 - m; } static sa_sint_t libsais16x64_gather_compacted_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n) { const fast_sint_t prefetch_distance = 64; sa_sint_t i = n - 2; sa_sint_t m = n - 1; fast_uint_t f0 = 1; fast_uint_t f1 = 0; fast_sint_t c0 = T[n - 
1]; fast_sint_t c1 = 0; for (; i >= 3; i -= 4) { libsais16x64_prefetchr(&T[i - prefetch_distance]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = i + 1; m -= (sa_sint_t)(f1 & ~f0 & (c0 >= 0)); c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = i - 0; m -= (sa_sint_t)(f0 & ~f1 & (c1 >= 0)); c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = i - 1; m -= (sa_sint_t)(f1 & ~f0 & (c0 >= 0)); c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = i - 2; m -= (sa_sint_t)(f0 & ~f1 & (c1 >= 0)); } for (; i >= 0; i -= 1) { c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = i + 1; m -= (sa_sint_t)(f0 & ~f1 & (c1 >= 0)); } return n - 1 - m; } #if defined(LIBSAIS_OPENMP) static void libsais16x64_count_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) { const fast_sint_t prefetch_distance = 64; memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t)); sa_sint_t i = n - 2; fast_uint_t f0 = 1; fast_uint_t f1 = 0; fast_sint_t c0 = T[n - 1]; fast_sint_t c1 = 0; for (; i >= prefetch_distance + 3; i -= 4) { libsais16x64_prefetchr(&T[i - 2 * prefetch_distance]); libsais16x64_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]); libsais16x64_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]); libsais16x64_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]); libsais16x64_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++; c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++; c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++; c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - 
(fast_sint_t)(f1))); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++; } for (; i >= 0; i -= 1) { c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++; } buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0)]++; } #endif static void libsais16x64_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) { const fast_sint_t prefetch_distance = 64; memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); sa_sint_t i = n - 2; fast_uint_t f0 = 1; fast_uint_t f1 = 0; fast_sint_t c0 = T[n - 1]; fast_sint_t c1 = 0; for (; i >= prefetch_distance + 3; i -= 4) { libsais16x64_prefetchr(&T[i - 2 * prefetch_distance]); libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]); libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]); libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]); libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++; c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++; c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++; c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++; } for (; i >= 0; i -= 1) { c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++; } buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++; } #if defined(LIBSAIS_OPENMP) static void libsais16x64_count_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) { const fast_sint_t prefetch_distance 
= 64; memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); sa_sint_t i = n - 2; fast_uint_t f0 = 1; fast_uint_t f1 = 0; fast_sint_t c0 = T[n - 1]; fast_sint_t c1 = 0; for (; i >= prefetch_distance + 3; i -= 4) { libsais16x64_prefetchr(&T[i - 2 * prefetch_distance]); libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]); libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]); libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]); libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++; c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++; c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++; c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++; } for (; i >= 0; i -= 1) { c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++; } c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++; } #endif static sa_sint_t libsais16x64_count_and_gather_lms_suffixes_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { memset(buckets, 0, (size_t)4 * ALPHABET_SIZE * sizeof(sa_sint_t)); fast_sint_t m = omp_block_start + omp_block_size - 1; if (omp_block_size > 0) { const fast_sint_t prefetch_distance = 256; fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1; while (j < n && (c1 = T[j]) == c0) { ++j; } fast_uint_t f0 = c0 >= c1, f1 = 0; for 
(i = m - 1, j = omp_block_start + 3; i >= j; i -= 4) { libsais16x64_prefetchr(&T[i - prefetch_distance]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0); buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++; c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 0); m -= (f0 & ~f1); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++; c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i - 1); m -= (f1 & ~f0); buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++; c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 2); m -= (f0 & ~f1); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++; } for (j -= 3; i >= j; i -= 1) { c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i + 1); m -= (f0 & ~f1); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++; } c1 = (i >= 0) ? 
T[i] : -1; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0); buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++; } return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m); } static sa_sint_t libsais16x64_count_and_gather_lms_suffixes_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { sa_sint_t m = 0; #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && omp_get_dynamic() == 0) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : n - omp_block_start;

        if (omp_num_threads == 1)
        {
            m = libsais16x64_count_and_gather_lms_suffixes_16u(T, SA, n, buckets, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
                thread_state[omp_thread_num].state.m = libsais16x64_count_and_gather_lms_suffixes_16u(T, SA, n, thread_state[omp_thread_num].state.buckets, omp_block_start, omp_block_size);

                if (thread_state[omp_thread_num].state.m > 0)
                {
                    thread_state[omp_thread_num].state.last_lms_suffix = SA[thread_state[omp_thread_num].state.position - 1];
                }
            }

            #pragma omp barrier

            #pragma omp master
            {
                memset(buckets, 0, (size_t)4 * ALPHABET_SIZE * sizeof(sa_sint_t));

                fast_sint_t t;
                for (t = omp_num_threads - 1; t >= 0; --t)
                {
                    m += (sa_sint_t)thread_state[t].state.m;

                    if (t != omp_num_threads - 1 && thread_state[t].state.m > 0)
                    {
                        memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.m], (size_t)thread_state[t].state.m * sizeof(sa_sint_t));
                    }

                    {
                        sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
                        fast_sint_t s; for (s = 0; s < 4 * ALPHABET_SIZE; s += 1) { sa_sint_t A = buckets[s], B = temp_bucket[s]; buckets[s] = A + B; temp_bucket[s] = A; }
                    }
                }
            }
        }
#endif
    }

    return m;
}

/*
 * Scans T[omp_block_start .. omp_block_start + omp_block_size - 1] from right
 * to left, counting symbols into a 4-slot-per-symbol bucket array and gathering
 * block positions into SA.
 *
 * For every scanned position a one-bit flag is computed: the flag is 1 when the
 * symbol is greater than its right neighbour, or equal to it while the right
 * neighbour's flag is 1 (in SA-IS terms this looks like the L-type flag).  A
 * position is gathered when its own flag is 0 and its left neighbour's flag is 1.
 *
 * buckets : cleared here (4 * k entries), then buckets[BUCKETS_INDEX4(c, s)] is
 *           incremented once per scanned position, where c is the symbol and
 *           s = 2 * (own flag) + (left neighbour's flag).
 * SA      : gathered positions are written downwards starting at index
 *           omp_block_start + omp_block_size - 1; the slot just below the last
 *           gathered position is also overwritten as scratch.
 * n       : used only to bound the look-ahead past the end of the block.
 *
 * Returns the number of gathered positions.
 *
 * NOTE(review): BUCKETS_INDEX4 and the libsais16x64_prefetchr/prefetchw helpers
 * are defined elsewhere; they are presumably an index macro and cache-prefetch
 * hints - confirm.
 */
static sa_sint_t libsais16x64_count_and_gather_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t));

    /* m is the next free SA slot; it only moves down when a position is kept. */
    fast_sint_t m = omp_block_start + omp_block_size - 1;

    if (omp_block_size > 0)
    {
        const fast_sint_t prefetch_distance = 64;

        fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;

        /* Look right of the block, past the run of symbols equal to T[m], to
           seed the flag of the last block position (c1 stays -1 at end of text). */
        while (j < n && (c1 = T[j]) == c0) { ++j; }

        fast_uint_t f0 = c0 >= c1, f1 = 0;

        /* Main loop, unrolled by four; c0/f0 and c1/f1 alternate between the
           roles "current position" and "left neighbour". */
        for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
        {
            libsais16x64_prefetchr(&T[i - 2 * prefetch_distance]);

            libsais16x64_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]);
            libsais16x64_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]);
            libsais16x64_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]);
            libsais16x64_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]);

            /* Each step: store the candidate unconditionally, then keep it by
               decrementing m only when (left flag & ~own flag) is 1. */
            c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0);
            buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++;

            c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 0); m -= (f0 & ~f1);
            buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++;

            c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i - 1); m -= (f1 & ~f0);
            buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++;

            c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 2); m -= (f0 & ~f1);
            buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++;
        }

        /* Tail loop: same step, one position at a time, down to the block start. */
        for (j -= prefetch_distance + 3; i >= j; i -= 1)
        {
            c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i + 1); m -= (f0 & ~f1);
            buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++;
        }

        /* Final step for the first block position: its left neighbour lies
           outside the block, or does not exist (-1) when the block starts at 0. */
        c1 = (i >= 0) ? T[i] : -1; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0);
        buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++;
    }

    return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
}

/*
 * Same right-to-left scan and gathering as the 4k variant, but with a
 * 2-slot-per-symbol bucket array: buckets (2 * k entries, cleared here) receive
 * one increment per scanned position at BUCKETS_INDEX2(c, s), where s is 1
 * exactly when the position is gathered (own flag 0, left neighbour's flag 1)
 * and 0 otherwise.
 *
 * Returns the number of positions gathered into SA, stored downwards from
 * index omp_block_start + omp_block_size - 1.
 */
static sa_sint_t libsais16x64_count_and_gather_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));

    fast_sint_t m = omp_block_start + omp_block_size - 1;

    if (omp_block_size > 0)
    {
        const fast_sint_t prefetch_distance = 64;

        fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;

        /* Seed the flag of the last block position from the text to its right. */
        while (j < n && (c1 = T[j]) == c0) { ++j; }

        fast_uint_t f0 = c0 >= c1, f1 = 0;

        for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
        {
            libsais16x64_prefetchr(&T[i - 2 * prefetch_distance]);

            libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]);
            libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]);
            libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]);
            libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]);

            c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0);
            buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++;

            c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 0); m -= (f0 & ~f1);
            buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++;

            c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i - 1); m -= (f1 & ~f0);
            buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++;

            c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 2); m -= (f0 & ~f1);
            buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++;
        }

        for (j -= prefetch_distance + 3; i >= j; i -= 1)
        {
            c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i + 1); m -= (f0 & ~f1);
            buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++;
        }

        /* First block position: left neighbour is outside the block, or -1 at text start. */
        c1 = (i >= 0) ? T[i] : -1; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0);
        buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++;
    }

    return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
}

/*
 * Variant of the 2k count-and-gather for text whose symbols may be negative.
 * Flags are computed from the raw symbol values.  A position is gathered only
 * when it qualifies as in the 2k variant AND its symbol is non-negative.  Every
 * position is still counted, using the symbol masked with SAINT_MAX as the
 * bucket index.
 *
 * NOTE(review): negative symbols presumably carry a marker set by an earlier
 * compaction step - confirm against the code that produces T.
 *
 * Returns the number of positions gathered into SA.
 */
static sa_sint_t libsais16x64_count_and_gather_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));

    fast_sint_t m = omp_block_start + omp_block_size - 1;

    if (omp_block_size > 0)
    {
        const fast_sint_t prefetch_distance = 64;

        fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;

        while (j < n && (c1 = T[j]) == c0) { ++j; }

        fast_uint_t f0 = c0 >= c1, f1 = 0;

        for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
        {
            libsais16x64_prefetchr(&T[i - 2 * prefetch_distance]);

            libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]);
            libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]);
            libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]);
            libsais16x64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]);

            /* The symbol is masked only after it has been used for the flag and
               the sign test, right before it indexes the bucket array. */
            c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0 & (c0 >= 0));
            c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++;

            c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 0); m -= (f0 & ~f1 & (c1 >= 0));
            c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++;

            c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i - 1); m -= (f1 & ~f0 & (c0 >= 0));
            c0 &= SAINT_MAX;
buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++; c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 2); m -= (f0 & ~f1 & (c1 >= 0)); c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++; } for (j -= prefetch_distance + 3; i >= j; i -= 1) { c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i + 1); m -= (f0 & ~f1 & (c1 >= 0)); c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++; } c1 = (i >= 0) ? T[i] : -1; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0 & (c0 >= 0)); c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++; } return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m); } #if defined(LIBSAIS_OPENMP) static fast_sint_t libsais16x64_get_bucket_stride(fast_sint_t free_space, fast_sint_t bucket_size, fast_sint_t num_buckets) { fast_sint_t bucket_size_1024 = (bucket_size + 1023) & (-1024); if (free_space / (num_buckets - 1) >= bucket_size_1024) { return bucket_size_1024; } fast_sint_t bucket_size_16 = (bucket_size + 15) & (-16); if (free_space / (num_buckets - 1) >= bucket_size_16) { return bucket_size_16; } return bucket_size; } static sa_sint_t libsais16x64_count_and_gather_lms_suffixes_32s_4k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { sa_sint_t m = 0; #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(local_buckets); UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); 
fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

        if (omp_num_threads == 1)
        {
            m = libsais16x64_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            fast_sint_t bucket_size   = 4 * (fast_sint_t)k;
            fast_sint_t free_space    = local_buckets ? LIBSAIS_LOCAL_BUFFER_SIZE : (buckets - &SA[n]);
            fast_sint_t bucket_stride = libsais16x64_get_bucket_stride(free_space, bucket_size, omp_num_threads);

            {
                /* Thread t counts into its own bucket copy, located t strides below `buckets`. */
                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
                thread_state[omp_thread_num].state.count = libsais16x64_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            if (omp_thread_num == omp_num_threads - 1)
            {
                /* The last thread packs every thread's gathered run against the
                   end of SA[0..n-1], walking the threads from last to first. */
                fast_sint_t t;
                for (t = omp_num_threads - 1; t >= 0; --t)
                {
                    m += (sa_sint_t)thread_state[t].state.count;

                    if (t != omp_num_threads - 1 && thread_state[t].state.count > 0)
                    {
                        memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.count], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
                    }
                }
            }
            else
            {
                /* The remaining threads split the bucket index range among
                   themselves and merge the per-thread copies.
                   NOTE(review): libsais16x64_accumulate_counts_s32 is defined
                   elsewhere; it presumably sums the copies into `buckets` - confirm. */
                omp_num_threads       = omp_num_threads - 1;
                omp_block_stride      = (bucket_size / omp_num_threads) & (-16);
                omp_block_start       = omp_thread_num * omp_block_stride;
                omp_block_size        = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : bucket_size - omp_block_start;

                libsais16x64_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads + 1);
            }
        }
#endif
    }

    return m;
}

/*
 * Multi-threaded count-and-gather using 2-slot-per-symbol buckets and extra
 * space for per-thread bucket copies.
 *
 * The text is split into per-thread blocks whose size is a multiple of 16 (the
 * last thread takes the remainder).  With a single thread the sequential
 * libsais16x64_count_and_gather_lms_suffixes_32s_2k is called on the whole
 * range.  Otherwise each thread scans its block into its own bucket copy at
 * buckets - omp_thread_num * bucket_stride.  After a barrier the last thread
 * concatenates the gathered positions at the end of SA[0..n-1] while the other
 * threads merge the bucket copies.
 *
 * local_buckets : non-zero selects LIBSAIS_LOCAL_BUFFER_SIZE as the available
 *                 space; otherwise the distance between &SA[n] and buckets is used.
 * thread_state  : per-thread scratch; the position and count fields are written.
 *
 * Returns the total number of gathered positions.
 */
static sa_sint_t libsais16x64_count_and_gather_lms_suffixes_32s_2k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t m = 0;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(local_buckets); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num    = 0;
        fast_sint_t omp_num_threads   = 1;
#endif
        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16);
        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

        if (omp_num_threads == 1)
        {
            m = libsais16x64_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            fast_sint_t bucket_size   = 2 * (fast_sint_t)k;
            fast_sint_t free_space    = local_buckets ? LIBSAIS_LOCAL_BUFFER_SIZE : (buckets - &SA[n]);
            fast_sint_t bucket_stride = libsais16x64_get_bucket_stride(free_space, bucket_size, omp_num_threads);

            {
                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
                thread_state[omp_thread_num].state.count = libsais16x64_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size);
            }

            /* All per-thread counts must be complete before they are combined. */
            #pragma omp barrier

            if (omp_thread_num == omp_num_threads - 1)
            {
                fast_sint_t t;
                for (t = omp_num_threads - 1; t >= 0; --t)
                {
                    m += (sa_sint_t)thread_state[t].state.count;

                    /* The last thread's run already ends at SA[n - 1]; only earlier runs move. */
                    if (t != omp_num_threads - 1 && thread_state[t].state.count > 0)
                    {
                        memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.count], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
                    }
                }
            }
            else
            {
                /* Re-purpose the block variables to partition the bucket range
                   over the remaining (omp_num_threads - 1) threads. */
                omp_num_threads       = omp_num_threads - 1;
                omp_block_stride      = (bucket_size / omp_num_threads) & (-16);
                omp_block_start       = omp_thread_num * omp_block_stride;
                omp_block_size        = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : bucket_size - omp_block_start;

                libsais16x64_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads + 1);
            }
        }
#endif
    }

    return m;
}

/*
 * Multi-threaded driver for the compacted 2k count-and-gather; returns nothing.
 * With a single thread it calls the sequential routine directly on (T, SA).
 */
static void libsais16x64_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(local_buckets); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num    = 0;
        fast_sint_t omp_num_threads   = 1;
#endif
        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16);
        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

        if (omp_num_threads == 1)
        {
            libsais16x64_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            fast_sint_t bucket_size   = 2 * (fast_sint_t)k;
            fast_sint_t free_space    = local_buckets ?
LIBSAIS_LOCAL_BUFFER_SIZE : (buckets - &SA[(fast_sint_t)n + (fast_sint_t)n]);
            fast_sint_t bucket_stride = libsais16x64_get_bucket_stride(free_space, bucket_size, omp_num_threads);

            {
                /* Each thread gathers into the second half of SA (SA + n) and
                   counts into its own bucket copy below `buckets`. */
                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
                thread_state[omp_thread_num].state.count = libsais16x64_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA + n, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            {
                /* m = positions gathered by this thread and all later threads,
                   i.e. where this thread's run starts, measured from SA[n]. */
                fast_sint_t t, m = 0; for (t = omp_num_threads - 1; t >= omp_thread_num; --t) { m += (sa_sint_t)thread_state[t].state.count; }

                if (thread_state[omp_thread_num].state.count > 0)
                {
                    memcpy(&SA[n - m], &SA[n + thread_state[omp_thread_num].state.position - thread_state[omp_thread_num].state.count], (size_t)thread_state[omp_thread_num].state.count * sizeof(sa_sint_t));
                }
            }

            {
                /* Every thread (including the last) merges one slice of the bucket copies. */
                omp_block_stride      = (bucket_size / omp_num_threads) & (-16);
                omp_block_start       = omp_thread_num * omp_block_stride;
                omp_block_size        = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : bucket_size - omp_block_start;

                libsais16x64_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads);
            }
        }
#endif
    }
}

#endif

/*
 * Count-and-gather (4-slot buckets) for the case where no extra space is used
 * for per-thread bucket copies.
 *
 * At most two threads are requested.  If the region runs with one thread, the
 * combined sequential routine handles the whole text.  With two threads,
 * thread 0 only counts (libsais16x64_count_lms_suffixes_32s_4k) while the other
 * thread only gathers (libsais16x64_gather_lms_suffixes_32s) and sets m.
 *
 * Returns the number of gathered positions.
 */
static sa_sint_t libsais16x64_count_and_gather_lms_suffixes_32s_4k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads)
{
    sa_sint_t m = 0;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(threads);

        fast_sint_t omp_num_threads   = 1;
#endif
        if (omp_num_threads == 1)
        {
            m = libsais16x64_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, 0, n);
        }
#if defined(LIBSAIS_OPENMP)
        else if (omp_thread_num == 0)
        {
            libsais16x64_count_lms_suffixes_32s_4k(T, n, k, buckets);
        }
        else
        {
            /* Only this branch writes m when two threads run. */
            m = libsais16x64_gather_lms_suffixes_32s(T, SA, n);
        }
#endif
    }

    return m;
}

/*
 * Same scheme as the 4k no-extra-space variant, using the 2-slot-per-symbol
 * routines: one thread does both jobs, or thread 0 counts while the other
 * thread gathers.  Returns the number of gathered positions.
 */
static sa_sint_t libsais16x64_count_and_gather_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads)
{
    sa_sint_t m = 0;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(threads);

        fast_sint_t omp_num_threads   = 1;
#endif
        if (omp_num_threads == 1)
        {
            m = libsais16x64_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
        }
#if defined(LIBSAIS_OPENMP)
        else if (omp_thread_num == 0)
        {
            libsais16x64_count_lms_suffixes_32s_2k(T, n, k, buckets);
        }
        else
        {
            m = libsais16x64_gather_lms_suffixes_32s(T, SA, n);
        }
#endif
    }

    return m;
}

static sa_sint_t libsais16x64_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads)
{
    sa_sint_t m = 0;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(threads);

        fast_sint_t omp_num_threads   = 1;
#endif
        if (omp_num_threads == 1)
        {
            m = libsais16x64_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
        }
#if defined(LIBSAIS_OPENMP)
        else if (omp_thread_num == 0)
        {
            /* Two threads: thread 0 counts, the other thread gathers and sets m. */
            libsais16x64_count_compacted_lms_suffixes_32s_2k(T, n, k, buckets);
        }
        else
        {
            m = libsais16x64_gather_compacted_lms_suffixes_32s(T, SA, n);
        }
#endif
    }

    return m;
}

/*
 * Dispatcher for the 4-slot-per-symbol count-and-gather.
 *
 * With OpenMP, it computes how many bucket copies (4 * k entries each, rounded
 * up to a multiple of 16) fit into the available space, caps that by `threads`
 * and, when the free-space path is taken, additionally by n / 16 / k.  The
 * free-space variant is used when more than one copy fits, n >= 65536 and
 * n / k >= 2, and it is always given at least 2 threads.  In every other case,
 * and always without OpenMP, the no-free-space variant is used.
 *
 * Returns the number of gathered positions reported by the chosen variant.
 */
static sa_sint_t libsais16x64_count_and_gather_lms_suffixes_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t m;

#if defined(LIBSAIS_OPENMP)
    fast_sint_t free_space = local_buckets ? LIBSAIS_LOCAL_BUFFER_SIZE : (buckets - &SA[n]);
    sa_sint_t max_threads = (sa_sint_t)(free_space / ((4 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; }

    if (max_threads > 1 && n >= 65536 && n / k >= 2)
    {
        if (max_threads > n / 16 / k) { max_threads = n / 16 / k; }

        m = libsais16x64_count_and_gather_lms_suffixes_32s_4k_fs_omp(T, SA, n, k, buckets, local_buckets, max_threads > 2 ? max_threads : 2, thread_state);
    }
    else
#else
    UNUSED(local_buckets);
    UNUSED(thread_state);
#endif
    /* This block is the `else` branch under OpenMP and the only path without it. */
    {
        m = libsais16x64_count_and_gather_lms_suffixes_32s_4k_nofs_omp(T, SA, n, k, buckets, threads);
    }

    return m;
}

/*
 * Dispatcher for the 2-slot-per-symbol count-and-gather.  Same decision logic
 * as the 4k dispatcher, with a bucket copy of 2 * k entries (rounded up to a
 * multiple of 16) and a thread cap of n / 8 / k on the free-space path.
 *
 * Returns the number of gathered positions reported by the chosen variant.
 */
static sa_sint_t libsais16x64_count_and_gather_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t m;

#if defined(LIBSAIS_OPENMP)
    fast_sint_t free_space = local_buckets ? LIBSAIS_LOCAL_BUFFER_SIZE : (buckets - &SA[n]);
    sa_sint_t max_threads = (sa_sint_t)(free_space / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; }

    if (max_threads > 1 && n >= 65536 && n / k >= 2)
    {
        if (max_threads > n / 8 / k) { max_threads = n / 8 / k; }

        m = libsais16x64_count_and_gather_lms_suffixes_32s_2k_fs_omp(T, SA, n, k, buckets, local_buckets, max_threads > 2 ? max_threads : 2, thread_state);
    }
    else
#else
    UNUSED(local_buckets);
    UNUSED(thread_state);
#endif
    {
        m = libsais16x64_count_and_gather_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads);
    }

    return m;
}

/*
 * Dispatcher for the compacted 2-slot-per-symbol count-and-gather; returns nothing.
 */
static void libsais16x64_count_and_gather_compacted_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    fast_sint_t free_space = local_buckets ?
LIBSAIS_LOCAL_BUFFER_SIZE : (buckets - &SA[(fast_sint_t)n + (fast_sint_t)n]); sa_sint_t max_threads = (sa_sint_t)(free_space / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; } if (!local_buckets && max_threads > 1 && n >= 65536 && n / k >= 2) { if (max_threads > n / 8 / k) { max_threads = n / 8 / k; } libsais16x64_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(T, SA, n, k, buckets, local_buckets, max_threads > 2 ? max_threads : 2, thread_state); } else #else UNUSED(local_buckets); UNUSED(thread_state); #endif { libsais16x64_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads); } } static void libsais16x64_count_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) { const fast_sint_t prefetch_distance = 64; memset(buckets, 0, (size_t)k * sizeof(sa_sint_t)); fast_sint_t i, j; for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8) { libsais16x64_prefetchr(&T[i + prefetch_distance]); buckets[T[i + 0]]++; buckets[T[i + 1]]++; buckets[T[i + 2]]++; buckets[T[i + 3]]++; buckets[T[i + 4]]++; buckets[T[i + 5]]++; buckets[T[i + 6]]++; buckets[T[i + 7]]++; } for (j += 7; i < j; i += 1) { buckets[T[i]]++; } } static sa_sint_t libsais16x64_initialize_buckets_start_and_end_16u(sa_sint_t * RESTRICT buckets, sa_sint_t * RESTRICT freq) { sa_sint_t * RESTRICT bucket_start = &buckets[6 * ALPHABET_SIZE]; sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE]; fast_sint_t k = -1; if (freq != NULL) { fast_sint_t i, j; sa_sint_t sum = 0; for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) { sa_sint_t total = buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)]; bucket_start[j] = sum; sum += total; bucket_end[j] = sum; k = total > 0 ? 
j : k; freq[j] = total; } } else { fast_sint_t i, j; sa_sint_t sum = 0; for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) { sa_sint_t total = buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)]; bucket_start[j] = sum; sum += total; bucket_end[j] = sum; k = total > 0 ? j : k; } } return (sa_sint_t)(k + 1); } static void libsais16x64_initialize_buckets_start_and_end_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets) { sa_sint_t * RESTRICT bucket_start = &buckets[4 * (fast_sint_t)k]; sa_sint_t * RESTRICT bucket_end = &buckets[5 * (fast_sint_t)k]; fast_sint_t i, j; sa_sint_t sum = 0; for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) { bucket_start[j] = sum; sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)]; bucket_end[j] = sum; } } static void libsais16x64_initialize_buckets_start_and_end_32s_4k(sa_sint_t k, sa_sint_t * RESTRICT buckets) { sa_sint_t * RESTRICT bucket_start = &buckets[2 * (fast_sint_t)k]; sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k]; fast_sint_t i, j; sa_sint_t sum = 0; for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1) { bucket_start[j] = sum; sum += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; bucket_end[j] = sum; } } static void libsais16x64_initialize_buckets_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets) { fast_sint_t i; sa_sint_t sum0 = 0; for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) { sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; buckets[i + BUCKETS_INDEX2(0, 0)] = sum0; } } static void 
libsais16x64_initialize_buckets_start_and_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets) { fast_sint_t i, j; for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1) { buckets[j] = buckets[i]; } buckets[k] = 0; memcpy(&buckets[k + 1], buckets, ((size_t)k - 1) * sizeof(sa_sint_t)); } static void libsais16x64_initialize_buckets_start_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets) { fast_sint_t i; sa_sint_t sum = 0; for (i = 0; i <= (fast_sint_t)k - 1; i += 1) { sa_sint_t tmp = buckets[i]; buckets[i] = sum; sum += tmp; } } static void libsais16x64_initialize_buckets_end_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets) { fast_sint_t i; sa_sint_t sum = 0; for (i = 0; i <= (fast_sint_t)k - 1; i += 1) { sum += buckets[i]; buckets[i] = sum; } } static sa_sint_t libsais16x64_initialize_buckets_for_lms_suffixes_radix_sort_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix) { { fast_uint_t f0 = 0; fast_uint_t f1 = 0; fast_sint_t c0 = T[first_lms_suffix]; fast_sint_t c1 = 0; for (; --first_lms_suffix >= 0; ) { c1 = c0; c0 = T[first_lms_suffix]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]--; } buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0)]--; } { sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE]; fast_sint_t i, j; sa_sint_t sum = 0; for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) { temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum; sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum; } return sum; } } static void libsais16x64_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix) { buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++; 
buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--; fast_sint_t i; sa_sint_t sum0 = 0, sum1 = 0; for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0)) { sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; sum1 += buckets[i + BUCKETS_INDEX2(0, 1)]; buckets[i + BUCKETS_INDEX2(0, 0)] = sum0; buckets[i + BUCKETS_INDEX2(0, 1)] = sum1; } } static sa_sint_t libsais16x64_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix) { { fast_uint_t f0 = 0; fast_uint_t f1 = 0; fast_sint_t c0 = T[first_lms_suffix]; fast_sint_t c1 = 0; for (; --first_lms_suffix >= 0; ) { c1 = c0; c0 = T[first_lms_suffix]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]--; } buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0)]--; } { sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k]; fast_sint_t i, j; sa_sint_t sum = 0; for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1) { sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum; } return sum; } } static void libsais16x64_initialize_buckets_for_radix_and_partial_sorting_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix) { sa_sint_t * RESTRICT bucket_start = &buckets[2 * (fast_sint_t)k]; sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k]; buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++; buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--; fast_sint_t i, j; sa_sint_t sum0 = 0, sum1 = 0; for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1) { bucket_start[j] = sum1; sum0 += buckets[i + BUCKETS_INDEX2(0, 1)]; sum1 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + 
BUCKETS_INDEX2(0, 1)]; buckets[i + BUCKETS_INDEX2(0, 1)] = sum0; bucket_end[j] = sum1; } } static void libsais16x64_radix_sort_lms_suffixes_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) { libsais16x64_prefetchr(&SA[i - 2 * prefetch_distance]); libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 0]]); libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 1]]); libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 2]]); libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 3]]); sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0; sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1; sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2; sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3; } for (j -= prefetch_distance + 3; i >= j; i -= 1) { sa_sint_t p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p; } } static void libsais16x64_radix_sort_lms_suffixes_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t flags, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { if (flags & LIBSAIS_FLAGS_GSA) { buckets[4 * ALPHABET_SIZE]--; } #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && m >= 65536 && omp_get_dynamic() == 0) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_num_threads = 1; #endif if (omp_num_threads == 1) { libsais16x64_radix_sort_lms_suffixes_16u(T, SA, &buckets[4 * ALPHABET_SIZE], 
(fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                /* Derive this thread's private bucket table from the shared one
                   and the thread's own slot-1 entries. */
                sa_sint_t * RESTRICT src_bucket = &buckets[4 * ALPHABET_SIZE];
                sa_sint_t * RESTRICT dst_bucket = thread_state[omp_thread_num].state.buckets;

                fast_sint_t i, j;
                for (i = BUCKETS_INDEX2(0, 0), j = BUCKETS_INDEX4(0, 1); i <= BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX2(1, 0), j += BUCKETS_INDEX4(1, 0))
                {
                    dst_bucket[i] = src_bucket[i] - dst_bucket[j];
                }
            }

            {
                /* omp_block_start = number of entries owned by this thread and
                   all later threads, i.e. the distance of this thread's range
                   from index n. */
                fast_sint_t t, omp_block_start = 0, omp_block_size = thread_state[omp_thread_num].state.m;
                for (t = omp_num_threads - 1; t >= omp_thread_num; --t) omp_block_start += thread_state[t].state.m;

                /* The thread whose range reaches back over all m entries skips
                   the lowest one, matching the single-thread call above that
                   processes only m - 1 entries. */
                if (omp_block_start == (fast_sint_t)m && omp_block_size > 0)
                {
                    omp_block_start -= 1; omp_block_size -= 1;
                }

                libsais16x64_radix_sort_lms_suffixes_16u(T, SA, thread_state[omp_thread_num].state.buckets, (fast_sint_t)n - omp_block_start, omp_block_size);
            }
        }
#endif
    }
}

/*
 * Distributes the suffix positions stored in
 * SA[omp_block_start .. omp_block_start + omp_block_size - 1] into their symbol
 * buckets, scanning from the highest index down.  For each position p,
 * induction_bucket[T[p]] is pre-decremented and p is stored at the resulting
 * SA index.  induction_bucket is indexed directly by symbol (one entry each).
 */
static void libsais16x64_radix_sort_lms_suffixes_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;

    /* Three look-ahead stages per iteration: SA entries at 3x the distance,
       the T symbols they point to at 2x, and the bucket entries at 1x. */
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4)
    {
        libsais16x64_prefetchr(&SA[i - 3 * prefetch_distance]);

        libsais16x64_prefetchr(&T[SA[i - 2 * prefetch_distance - 0]]);
        libsais16x64_prefetchr(&T[SA[i - 2 * prefetch_distance - 1]]);
        libsais16x64_prefetchr(&T[SA[i - 2 * prefetch_distance - 2]]);
        libsais16x64_prefetchr(&T[SA[i - 2 * prefetch_distance - 3]]);

        libsais16x64_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 0]]]);
        libsais16x64_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 1]]]);
        libsais16x64_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 2]]]);
        libsais16x64_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 3]]]);

        sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[T[p0]]] = p0;
        sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[T[p1]]] = p1;
        sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[T[p2]]] = p2;
        sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[T[p3]]] = p3;
    }

    /* Remaining entries, one at a time, without look-ahead. */
    for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; SA[--induction_bucket[T[p]]] = p;
    }
}

/*
 * Same distribution as the 6k variant, but the bucket entry for symbol c is
 * induction_bucket[BUCKETS_INDEX2(c, 0)] (2-slot layout).
 */
static void libsais16x64_radix_sort_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4)
    {
        libsais16x64_prefetchr(&SA[i - 3 * prefetch_distance]);

        libsais16x64_prefetchr(&T[SA[i - 2 * prefetch_distance - 0]]);
        libsais16x64_prefetchr(&T[SA[i - 2 * prefetch_distance - 1]]);
        libsais16x64_prefetchr(&T[SA[i - 2 * prefetch_distance - 2]]);
        libsais16x64_prefetchr(&T[SA[i - 2 * prefetch_distance - 3]]);

        libsais16x64_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 0]], 0)]);
        libsais16x64_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 1]], 0)]);
        libsais16x64_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 2]], 0)]);
        libsais16x64_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 3]], 0)]);

        sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0;
        sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1;
        sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2;
        sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3;
    }

    for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p;
    }
}

#if defined(LIBSAIS_OPENMP)

/*
 * For every i in [omp_block_start, omp_block_start + omp_block_size), copies
 * SA[i] into cache[i].index and the symbol T[SA[i]] into cache[i].symbol.
 * Scans upwards, four entries per iteration, with a one-at-a-time tail.
 */
static void libsais16x64_radix_sort_lms_suffixes_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
    {
        libsais16x64_prefetchr(&SA[i + 2 * prefetch_distance]);

        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 0]]);
        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 1]]);
        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 2]]);
        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 3]]);

        libsais16x64_prefetchw(&cache[i + prefetch_distance]);

        /* The inner assignment stores the position; its value then indexes T. */
        cache[i + 0].symbol = T[cache[i + 0].index = SA[i + 0]];
        cache[i + 1].symbol = T[cache[i + 1].index = SA[i + 1]];
        cache[i + 2].symbol = T[cache[i + 2].index = SA[i + 2]];
        cache[i + 3].symbol = T[cache[i + 3].index = SA[i + 3]];
    }

    for (j += prefetch_distance + 3; i < j; i += 1)
    {
        cache[i].symbol = T[cache[i].index = SA[i]];
    }
}

/*
 * Walks the cache entries of the block from the highest index down and replaces
 * each entry's symbol with the pre-decremented value of
 * induction_bucket[symbol].
 */
static void libsais16x64_radix_sort_lms_suffixes_32s_6k_block_sort(sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
    {
        libsais16x64_prefetchw(&cache[i - 2 * prefetch_distance]);

        libsais16x64_prefetchw(&induction_bucket[cache[i - prefetch_distance - 0].symbol]);
        libsais16x64_prefetchw(&induction_bucket[cache[i - prefetch_distance - 1].symbol]);
        libsais16x64_prefetchw(&induction_bucket[cache[i - prefetch_distance - 2].symbol]);
        libsais16x64_prefetchw(&induction_bucket[cache[i - prefetch_distance - 3].symbol]);

        cache[i - 0].symbol = --induction_bucket[cache[i - 0].symbol];
        cache[i - 1].symbol = --induction_bucket[cache[i - 1].symbol];
        cache[i - 2].symbol = --induction_bucket[cache[i - 2].symbol];
        cache[i - 3].symbol = --induction_bucket[cache[i - 3].symbol];
    }

    for (j -= prefetch_distance + 3; i >= j; i -= 1)
    {
cache[i].symbol = --induction_bucket[cache[i].symbol];
    }
}

/*
 * Same backward pass as the 6k block sort, except that the counter for a cached
 * symbol s is addressed as induction_bucket[BUCKETS_INDEX2(s, 0)].
 *
 * After the call cache[i].symbol holds the pre-decremented counter value for
 * every i in [omp_block_start, omp_block_start + omp_block_size).
 */
static void libsais16x64_radix_sort_lms_suffixes_32s_2k_block_sort(sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Unrolled by four; j keeps cache[i - prefetch_distance - 3] at or above omp_block_start. */
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
    {
        libsais16x64_prefetchw(&cache[i - 2 * prefetch_distance]);

        libsais16x64_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 0].symbol, 0)]);
        libsais16x64_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 1].symbol, 0)]);
        libsais16x64_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 2].symbol, 0)]);
        libsais16x64_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 3].symbol, 0)]);

        cache[i - 0].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 0].symbol, 0)];
        cache[i - 1].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 1].symbol, 0)];
        cache[i - 2].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 2].symbol, 0)];
        cache[i - 3].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 3].symbol, 0)];
    }

    /* Tail: restore j to omp_block_start and finish one entry at a time. */
    for (j -= prefetch_distance + 3; i >= j; i -= 1)
    {
        cache[i].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i].symbol, 0)];
    }
}

/*
 * Radix-sorts one block SA[block_start .. block_start + block_size - 1] with a team of threads.
 *
 * The parallel region is only activated when threads > 1 and block_size >= 16384.
 * With a single thread the sequential 32s_6k routine is called directly. Otherwise:
 *   1. every thread gathers its sub-block of SA into the shared cache,
 *   2. after a barrier the master thread alone runs the 6k block sort over the
 *      whole block (the bucket counters are updated sequentially),
 *   3. after a second barrier every thread calls
 *      libsais16x64_place_cached_suffixes for its sub-block.
 *
 * cache is passed on as (cache - block_start) so that the helpers can index it
 * with absolute SA positions.
 */
static void libsais16x64_radix_sort_lms_suffixes_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num    = 0;
        fast_sint_t omp_num_threads   = 1;
#endif
fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
        /* Stride is rounded down to a multiple of 16; the last thread also takes the remainder. */
        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            libsais16x64_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                libsais16x64_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* Only the master thread touches induction_bucket, over the whole block. */
            #pragma omp master
            {
                libsais16x64_radix_sort_lms_suffixes_32s_6k_block_sort(induction_bucket, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            {
                libsais16x64_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }
}

/*
 * Radix-sorts one block SA[block_start .. block_start + block_size - 1] with a team of threads,
 * using the 2k bucket addressing.
 *
 * The parallel region is only activated when threads > 1 and block_size >= 16384.
 * With a single thread the sequential 32s_2k routine is called directly. Otherwise
 * each thread gathers its sub-block into the cache, the master thread runs the 2k
 * block sort over the whole block between two barriers, and finally each thread
 * calls libsais16x64_place_cached_suffixes for its sub-block.
 *
 * cache is passed on as (cache - block_start) so that the helpers can index it
 * with absolute SA positions.
 */
static void libsais16x64_radix_sort_lms_suffixes_32s_2k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num    = 0;
        fast_sint_t omp_num_threads   = 1;
#endif
        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
        /* Stride is rounded down to a multiple of 16; the last thread also takes the remainder. */
        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            libsais16x64_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                libsais16x64_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* Only the master thread touches induction_bucket, over the whole block. */
            #pragma omp master
            {
                libsais16x64_radix_sort_lms_suffixes_32s_2k_block_sort(induction_bucket, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            {
                libsais16x64_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }
}

#endif

/*
 * Driver for the 6k radix pass over the m - 1 entries SA[n - m + 1 .. n - 1].
 *
 * When threads == 1 or m < 65536 the sequential routine handles the whole range.
 * Otherwise (OpenMP builds only) the range is cut into blocks of at most
 * threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries. block_start / block_end count
 * entries from the top of SA, so the first block handed to the block routine is the
 * one ending at SA[n - 1] and later blocks move toward lower indices.
 * All blocks share thread_state[0].state.cache.
 */
static void libsais16x64_radix_sort_lms_suffixes_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    if (threads == 1 || m < 65536)
    {
        libsais16x64_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start, block_end;
        for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end)
        {
            /* Clamp the last block so that exactly m - 1 entries are processed in total. */
            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end >= m) { block_end = (fast_sint_t)m - 1; }

            libsais16x64_radix_sort_lms_suffixes_32s_6k_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end, block_end - block_start, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif
}

/*
 * Driver for the 2k radix pass over the m - 1 entries SA[n - m + 1 .. n - 1].
 *
 * Same structure as the 6k driver: sequential when threads == 1 or m < 65536,
 * otherwise (OpenMP builds only) blocks of at most
 * threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries are processed from the top of
 * SA toward lower indices, all sharing thread_state[0].state.cache.
 */
static void libsais16x64_radix_sort_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    if (threads == 1 || m < 65536)
    {
        libsais16x64_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start, block_end;
        for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end)
        {
            /* Clamp the last block so that exactly m - 1 entries are processed in total. */
            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end >= m) { block_end = (fast_sint_t)m - 1; }

            libsais16x64_radix_sort_lms_suffixes_32s_2k_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end, block_end - block_start, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif
}

/*
 * Scans T from right to left, and places selected positions into SA through
 * pre-decremented bucket counters. Returns the number of positions placed.
 *
 * c0 / c1 alternate between "current" and "right neighbour" symbol, f0 / f1 are the
 * matching 0/1 flags. A flag is 1 when the symbol is greater than its right
 * neighbour, or equal to it while the neighbour's flag is 1. A position is recorded
 * when its own flag is 0 and the flag of the symbol to its left is 1.
 * NOTE(review): this looks like the usual L-/S-type rule for detecting LMS
 * positions - confirm against the algorithm description.
 *
 * c2 remembers the bucket symbol of the most recent placement. If more than one
 * position was placed, the slot written last (SA[buckets[c2]]) is overwritten with
 * 0; the returned count m is not adjusted for that.
 */
static sa_sint_t libsais16x64_radix_sort_lms_suffixes_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t             i = n - 2;
    sa_sint_t             m = 0;
    /* The scan starts with T[n - 1] as the right neighbour and its flag preset to 1. */
    fast_uint_t           f0 = 1;
    fast_uint_t           f1 = 0;
    fast_sint_t           c0 = T[n - 1];
    fast_sint_t           c1 = 0;
    fast_sint_t           c2 = 0;

    /* Unrolled by four; the roles of (c0, f0) and (c1, f1) swap on every step. */
    for (; i >= prefetch_distance + 3; i -= 4)
    {
        libsais16x64_prefetchr(&T[i - 2 * prefetch_distance]);

        libsais16x64_prefetchw(&buckets[T[i - prefetch_distance - 0]]);
        libsais16x64_prefetchw(&buckets[T[i - prefetch_distance - 1]]);
        libsais16x64_prefetchw(&buckets[T[i - prefetch_distance - 2]]);
        libsais16x64_prefetchw(&buckets[T[i - prefetch_distance - 3]]);

        c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0)));
        if (f1 & ~f0) { SA[--buckets[c2 = c0]] = i + 1; m++; }

        c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1)));
        if (f0 & ~f1) { SA[--buckets[c2 = c1]] = i - 0; m++; }

        c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0)));
        if (f1 & ~f0) { SA[--buckets[c2 = c0]] = i - 1; m++; }

        c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1)));
        if (f0 & ~f1) { SA[--buckets[c2 = c1]] = i - 2; m++; }
    }

    /* Tail: same step without unrolling, down to position 0. */
    for (; i >= 0; i -= 1)
    {
        c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1)));
        if (f0 & ~f1) { SA[--buckets[c2 = c1]] = i + 1; m++; }
    }

    if (m > 1)
    {
        /* buckets[c2] still equals the index of the last slot written above. */
        SA[buckets[c2]] = 0;
    }

    return m;
}

/*
 * Sets the SAINT_MIN bit on SA[induction_bucket[i]] for every i in
 * [omp_block_start, omp_block_start + omp_block_size).
 *
 * induction_bucket is indexed directly by i and supplies the SA position to mark.
 */
static void libsais16x64_radix_sort_set_markers_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT
induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Unrolled by four; j leaves room for the look-ahead reads of induction_bucket. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
    {
        libsais16x64_prefetchr(&induction_bucket[i + 2 * prefetch_distance]);

        libsais16x64_prefetchw(&SA[induction_bucket[i + prefetch_distance + 0]]);
        libsais16x64_prefetchw(&SA[induction_bucket[i + prefetch_distance + 1]]);
        libsais16x64_prefetchw(&SA[induction_bucket[i + prefetch_distance + 2]]);
        libsais16x64_prefetchw(&SA[induction_bucket[i + prefetch_distance + 3]]);

        SA[induction_bucket[i + 0]] |= SAINT_MIN;
        SA[induction_bucket[i + 1]] |= SAINT_MIN;
        SA[induction_bucket[i + 2]] |= SAINT_MIN;
        SA[induction_bucket[i + 3]] |= SAINT_MIN;
    }

    /* Tail: remaining entries one at a time. */
    for (j += prefetch_distance + 3; i < j; i += 1)
    {
        SA[induction_bucket[i]] |= SAINT_MIN;
    }
}

/*
 * Sets SUFFIX_GROUP_MARKER on SA[induction_bucket[BUCKETS_INDEX2(i, 0)]] for every i in
 * [omp_block_start, omp_block_start + omp_block_size).
 *
 * Differs from the 6k variant in the bucket addressing (BUCKETS_INDEX2(i, 0)
 * instead of i) and in the marker bit that is OR-ed in.
 */
static void libsais16x64_radix_sort_set_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Unrolled by four; j leaves room for the look-ahead reads of induction_bucket. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
    {
        libsais16x64_prefetchr(&induction_bucket[BUCKETS_INDEX2(i + 2 * prefetch_distance, 0)]);

        libsais16x64_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 0, 0)]]);
        libsais16x64_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 1, 0)]]);
        libsais16x64_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 2, 0)]]);
        libsais16x64_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 3, 0)]]);

        SA[induction_bucket[BUCKETS_INDEX2(i + 0, 0)]] |= SUFFIX_GROUP_MARKER;
        SA[induction_bucket[BUCKETS_INDEX2(i + 1, 0)]] |= SUFFIX_GROUP_MARKER;
        SA[induction_bucket[BUCKETS_INDEX2(i + 2, 0)]] |= SUFFIX_GROUP_MARKER;
        SA[induction_bucket[BUCKETS_INDEX2(i + 3, 0)]] |= SUFFIX_GROUP_MARKER;
    }

    /* Tail: remaining entries one at a time. */
    for (j += prefetch_distance + 3; i < j;
i += 1)
    {
        SA[induction_bucket[BUCKETS_INDEX2(i, 0)]] |= SUFFIX_GROUP_MARKER;
    }
}

/*
 * Runs the 6k marker pass over bucket indices [0, k - 1), optionally in parallel.
 *
 * The parallel region is only activated when threads > 1 and k >= 65536. Each
 * thread gets a contiguous range whose length is rounded down to a multiple of 16;
 * the last thread also takes the remainder. Without OpenMP the whole range
 * [0, k - 1) is processed by the caller's thread.
 */
static void libsais16x64_radix_sort_set_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && k >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
        fast_sint_t omp_block_stride  = (((fast_sint_t)k - 1) / omp_num_threads) & (-16);
        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : (fast_sint_t)k - 1 - omp_block_start;
#else
        UNUSED(threads);

        fast_sint_t omp_block_start   = 0;
        fast_sint_t omp_block_size    = (fast_sint_t)k - 1;
#endif

        libsais16x64_radix_sort_set_markers_32s_6k(SA, induction_bucket, omp_block_start, omp_block_size);
    }
}

/*
 * Runs the 4k marker pass over bucket indices [0, k - 1), optionally in parallel.
 *
 * Work is partitioned exactly as in the 6k driver: parallel only when threads > 1
 * and k >= 65536, per-thread ranges rounded down to a multiple of 16, remainder
 * handled by the last thread.
 */
static void libsais16x64_radix_sort_set_markers_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && k >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
        fast_sint_t omp_block_stride  = (((fast_sint_t)k - 1) / omp_num_threads) & (-16);
        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : (fast_sint_t)k - 1 - omp_block_start;
#else
        UNUSED(threads);

        fast_sint_t omp_block_start   = 0;
        fast_sint_t omp_block_size    = (fast_sint_t)k - 1;
#endif

        libsais16x64_radix_sort_set_markers_32s_4k(SA, induction_bucket, omp_block_start, omp_block_size);
    }
}

/*
 * Converts the four-per-symbol counters in `buckets` into running totals laid out
 * two per symbol, for every symbol 0 .. ALPHABET_SIZE - 1.
 *
 * Before the loop the counter at BUCKETS_INDEX4(T[first_lms_suffix], 1) is
 * incremented once. Then, per symbol:
 *   temp_bucket[j + 0] = sum0 before this symbol (temp_bucket = &buckets[4 * ALPHABET_SIZE]),
 *   sum0 += counter 0 + counter 2,   sum1 += counter 1,
 *   buckets[j + 0] = sum0,           buckets[j + 1] = sum1.
 * sum0 starts at left_suffixes_count + 1 and sum1 at 0.
 *
 * NOTE(review): the two-per-symbol results are written back into the front of the
 * same array that is being read four per symbol; each symbol's counters are read
 * before its results are stored, but this also relies on j never overtaking i,
 * which depends on the BUCKETS_INDEX2 / BUCKETS_INDEX4 definitions (not visible here).
 */
static void libsais16x64_initialize_buckets_for_partial_sorting_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count)
{
    sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];

    buckets[BUCKETS_INDEX4((fast_uint_t)T[first_lms_suffix], 1)]++;

    fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0;
    for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0))
    {
        temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0;

        sum0 += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 2)];
        sum1 += buckets[i + BUCKETS_INDEX4(0, 1)];

        buckets[j + BUCKETS_INDEX2(0, 0)] = sum0;
        buckets[j + BUCKETS_INDEX2(0, 1)] = sum1;
    }
}

/*
 * Rewrites the four-per-symbol counters in `buckets` in place and fills a
 * two-per-symbol table at temp_bucket = &buckets[4 * k], for symbols 0 .. k - 1.
 *
 * Per symbol, with SS / LS / SL / LL being the four counters read from slots 0..3:
 *   slot 0 = sum0, slot 1 = sum2, slot 2 = 0, slot 3 = 0   (values before accumulation),
 *   sum0 += SS + SL,  sum1 += LS,  sum2 += LS + LL,
 *   temp_bucket[j + 0] = sum0,  temp_bucket[j + 1] = sum1   (values after accumulation).
 * sum0 starts at left_suffixes_count + 1, sum1 and sum2 at 0.
 *
 * The work is split into two loops with identical bodies: the first stops just
 * before the symbol T[first_lms_suffix], then sum1 is bumped by one, and the second
 * loop continues from that symbol through k - 1. The parameter first_lms_suffix is
 * overwritten with that symbol in the first loop's initialiser.
 */
static void libsais16x64_initialize_buckets_for_partial_sorting_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count)
{
    sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k];

    fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0, sum2 = 0;
    for (first_lms_suffix = T[first_lms_suffix], i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i != BUCKETS_INDEX4((fast_sint_t)first_lms_suffix, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0))
    {
        sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)];
        sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)];
        sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)];
        sa_sint_t LL = buckets[i + BUCKETS_INDEX4(0, 3)];

        buckets[i + BUCKETS_INDEX4(0, 0)] = sum0;
        buckets[i + BUCKETS_INDEX4(0, 1)] = sum2;
        buckets[i + BUCKETS_INDEX4(0, 2)] = 0;
        buckets[i + BUCKETS_INDEX4(0, 3)] = 0;

        sum0 +=
SS + SL; sum1 += LS; sum2 += LS + LL;

        temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0;
        temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1;
    }

    /* Second half: sum1 is bumped by one, then the same per-symbol step runs through symbol k - 1. */
    for (sum1 += 1; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0))
    {
        sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)];
        sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)];
        sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)];
        sa_sint_t LL = buckets[i + BUCKETS_INDEX4(0, 3)];

        buckets[i + BUCKETS_INDEX4(0, 0)] = sum0;
        buckets[i + BUCKETS_INDEX4(0, 1)] = sum2;
        buckets[i + BUCKETS_INDEX4(0, 2)] = 0;
        buckets[i + BUCKETS_INDEX4(0, 3)] = 0;

        sum0 += SS + SL; sum1 += LS; sum2 += LS + LL;

        temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0;
        temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1;
    }
}

/*
 * Left-to-right scan over SA[omp_block_start .. omp_block_start + omp_block_size - 1]
 * that appends position p - 1 to a bucket for every entry p, and returns the updated
 * counter d.
 *
 * For each entry p = SA[i]:
 *   - d is incremented when p has its sign bit set; the sign bit is then cleared,
 *   - v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]) selects the bucket,
 *   - p - 1 is stored at SA[induction_bucket[v]++], with the top bit
 *     (SAINT_BIT - 1) set when distinct_names[v] differs from d,
 *   - distinct_names[v] is set to d.
 *
 * induction_bucket is &buckets[4 * ALPHABET_SIZE] and distinct_names is
 * &buckets[2 * ALPHABET_SIZE]. Every entry is dereferenced as T[p - 1] and T[p - 2]
 * without a range check, so the caller must only pass blocks whose entries have
 * p >= 2 after masking.
 */
static sa_sint_t libsais16x64_partial_sorting_scan_left_to_right_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];

    fast_sint_t i, j;
    /* Unrolled by two; j leaves room for the look-ahead reads of SA. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais16x64_prefetchr(&SA[i + 2 * prefetch_distance]);

        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);

        sa_sint_t p0 = SA[i + 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]);
        SA[induction_bucket[v0]++] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d;

        sa_sint_t p1 = SA[i + 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 -
2] >= T[p1 - 1]);
        SA[induction_bucket[v1]++] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
    }

    /* Tail: remaining entries one at a time, without look-ahead. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
        SA[induction_bucket[v]++] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
    }

    return d;
}

#if defined(LIBSAIS_OPENMP)

/*
 * First phase of the multi-threaded 16u scan: records what one thread's sub-block
 * would do, without writing to SA.
 *
 * `buckets` here is a per-call scratch area: the first 2 * k entries at
 * &buckets[0] (induction_bucket) and at &buckets[2 * ALPHABET_SIZE]
 * (distinct_names) are zeroed first. Then for each SA entry of the block:
 *   - the raw SA value (sign bit included) is stored in cache[count].index,
 *   - the bucket index v is stored in cache[count].symbol,
 *   - induction_bucket[v] is incremented (a per-bucket count for this sub-block),
 *   - distinct_names[v] receives the local counter d, which starts at 1 and is
 *     incremented for every entry with the sign bit set.
 *
 * On return state[0].state.position holds d - 1 (the number of sign-bit entries
 * seen) and state[0].state.count the number of cached entries. cache is filled
 * from index 0, independent of omp_block_start.
 */
static void libsais16x64_partial_sorting_scan_left_to_right_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size, LIBSAIS_THREAD_STATE * RESTRICT state)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];

    memset(induction_bucket, 0, (size_t)2 * (size_t)k * sizeof(sa_sint_t));
    memset(distinct_names  , 0, (size_t)2 * (size_t)k * sizeof(sa_sint_t));

    fast_sint_t i, j, count = 0; sa_sint_t d = 1;
    /* Unrolled by two; j leaves room for the look-ahead reads of SA. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais16x64_prefetchr(&SA[i + 2 * prefetch_distance]);

        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);

        sa_sint_t p0 = cache[count].index = SA[i + 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); induction_bucket[v0]++; distinct_names[v0] = d;
        sa_sint_t p1 = cache[count].index = SA[i + 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2]
>= T[p1 - 1]); induction_bucket[v1]++; distinct_names[v1] = d;
    }

    /* Tail: remaining entries one at a time, without look-ahead. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = cache[count].index = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]); induction_bucket[v]++; distinct_names[v] = d;
    }

    state[0].state.position   = (fast_sint_t)d - 1;
    state[0].state.count      = count;
}

/*
 * Final phase of the multi-threaded 16u scan: replays `count` cached entries and
 * writes them into SA.
 *
 * `buckets` is the per-thread scratch area; induction_bucket (&buckets[0]) supplies
 * the next free SA slot per bucket and distinct_names (&buckets[2 * ALPHABET_SIZE])
 * the last name seen per bucket. For each cached entry, d is incremented when the
 * cached index is negative, then (index - 1) is stored at SA[induction_bucket[v]++]
 * with the top bit set when distinct_names[v] differs from d.
 *
 * The cached index is not masked with SAINT_MAX here, unlike in the sequential scan.
 */
static void libsais16x64_partial_sorting_scan_left_to_right_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];

    fast_sint_t i, j;
    /* Unrolled by two over cache[0 .. count - 1]. */
    for (i = 0, j = count - 1; i < j; i += 2)
    {
        libsais16x64_prefetchr(&cache[i + prefetch_distance]);

        sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol;
        SA[induction_bucket[v0]++] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d;

        sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol;
        SA[induction_bucket[v1]++] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
    }

    /* Tail: at most one remaining entry. */
    for (j += 1; i < j; i += 1)
    {
        sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol;
        SA[induction_bucket[v]++] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
    }
}

/*
 * Processes one block SA[block_start .. block_start + block_size - 1] of the 16u
 * left-to-right scan with a team of threads and returns the updated counter d.
 *
 * The parallel region is only activated when threads > 1, block_size is at least
 * 64 * max(k, 256), and omp_get_dynamic() reports 0. With a single thread the
 * sequential scan is called. Otherwise:
 *   1. every thread runs block_prepare on its sub-block using its own
 *      thread_state[omp_thread_num] buckets and cache,
 *   2. the master thread walks the threads in order and, per bucket c < 2 * k,
 *      adds the thread's count to the global induction_bucket while handing the
 *      thread the previous global value as its starting slot; distinct names are
 *      rebased from thread-local values to global ones in the same way, and
 *      thread_state[t].state.position ends up holding the value of d at which
 *      thread t's sub-block starts,
 *   3. every thread runs block_place with its rebased buckets and starting d.
 */
static sa_sint_t libsais16x64_partial_sorting_scan_left_to_right_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ? k : 256) && omp_get_dynamic() == 0)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(k); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num    = 0;
        fast_sint_t omp_num_threads   = 1;
#endif
        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
        /* Stride is rounded down to a multiple of 16; the last thread also takes the remainder. */
        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            d = libsais16x64_partial_sorting_scan_left_to_right_16u(T, SA, buckets, d, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                libsais16x64_partial_sorting_scan_left_to_right_16u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]);
            }

            #pragma omp barrier

            #pragma omp master
            {
                sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
                sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];

                fast_sint_t t;
                for (t = 0; t < omp_num_threads; ++t)
                {
                    sa_sint_t * RESTRICT temp_induction_bucket    = &thread_state[t].state.buckets[0 * ALPHABET_SIZE];
                    sa_sint_t * RESTRICT temp_distinct_names      = &thread_state[t].state.buckets[2 * ALPHABET_SIZE];

                    /* Exchange: global counter advances by the thread's count, thread receives its start slot. */
                    fast_sint_t c;
                    for (c = 0; c < 2 * k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A + B; temp_induction_bucket[c] = A; }

                    /* Thread-local names start at 1, so B + (d - 1) is the global name; B == 0 means the bucket was untouched. */
                    for (d -= 1, c = 0; c < 2 * k; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ?
D : A; temp_distinct_names[c] = A; }
                    /* Advance d past this thread's sign-bit entries and leave the thread's starting d in position. */
                    d += 1 + (sa_sint_t)thread_state[t].state.position; thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position;
                }
            }

            #pragma omp barrier

            {
                libsais16x64_partial_sorting_scan_left_to_right_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position);
            }
        }
#endif
    }

    return d;
}

#endif

/*
 * Driver for the 16u left-to-right scan over SA[0 .. left_suffixes_count - 1];
 * returns the updated counter d.
 *
 * Before scanning, position n - 1 is placed into the bucket selected by
 * BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1]) with SAINT_MIN set, and that
 * bucket's distinct name becomes the pre-incremented d.
 *
 * When threads == 1 or left_suffixes_count < 65536 the sequential scan handles the
 * whole range. Otherwise (OpenMP builds only) the range is walked manually:
 *   - entries equal to 0 are skipped,
 *   - a run of consecutive non-zero entries is collected, capped at
 *     threads * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads) entries,
 *   - runs shorter than 32 are processed inline with the same per-entry step as
 *     the sequential scan; longer runs go to the block routine.
 */
static sa_sint_t libsais16x64_partial_sorting_scan_left_to_right_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];

    SA[induction_bucket[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN;
    distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])] = ++d;

    if (threads == 1 || left_suffixes_count < 65536)
    {
        d = libsais16x64_partial_sorting_scan_left_to_right_16u(T, SA, buckets, d, 0, left_suffixes_count);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start;
        /* No increment clause: each branch below advances block_start itself. */
        for (block_start = 0; block_start < left_suffixes_count; )
        {
            if (SA[block_start] == 0)
            {
                block_start++;
            }
            else
            {
                fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > left_suffixes_count) { block_max_end = left_suffixes_count;}
                /* Extend the block while entries are non-zero and the cap is not reached. */
                fast_sint_t block_end     = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; }
                fast_sint_t block_size    = block_end - block_start;

                if (block_size < 32)
                {
                    for (; block_start < block_end; block_start += 1)
                    {
                        sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
                        SA[induction_bucket[v]++]
= (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
                    }
                }
                else
                {
                    d = libsais16x64_partial_sorting_scan_left_to_right_16u_block_omp(T, SA, k, buckets, d, block_start, block_size, threads, thread_state);
                    block_start = block_end;
                }
            }
        }
    }
#else
    UNUSED(k); UNUSED(thread_state);
#endif

    return d;
}

/*
 * Left-to-right scan over SA[omp_block_start .. omp_block_start + omp_block_size - 1]
 * for the 6k bucket layout; returns the updated counter d.
 *
 * For each entry p = SA[i]:
 *   - d is incremented when p has its sign bit set; the sign bit is then cleared,
 *   - v = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]) selects the bucket,
 *   - p - 1 is stored at SA[buckets[v]++], with the top bit set when
 *     buckets[2 + v] differs from d,
 *   - buckets[2 + v] is set to d.
 * Here the next-slot counter (buckets[v]) and the last name (buckets[2 + v]) live
 * in the same array, two entries apart.
 */
static sa_sint_t libsais16x64_partial_sorting_scan_left_to_right_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Unrolled by two with three look-ahead stages (SA, then T, then buckets). */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2)
    {
        libsais16x64_prefetchr(&SA[i + 3 * prefetch_distance]);

        libsais16x64_prefetchr(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 1);
        libsais16x64_prefetchr(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 2);
        libsais16x64_prefetchr(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 1);
        libsais16x64_prefetchr(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 2);

        /* (p > 0) keeps the look-ahead read at T[0] when the future entry is 0, avoiding T[-1]. */
        sa_sint_t p0 = SA[i + prefetch_distance + 0] & SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); libsais16x64_prefetchw(&buckets[v0]);
        sa_sint_t p1 = SA[i + prefetch_distance + 1] & SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0); libsais16x64_prefetchw(&buckets[v1]);

        sa_sint_t p2 = SA[i + 0]; d += (p2 < 0); p2 &= SAINT_MAX; sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] >= T[p2 - 1]);
        SA[buckets[v2]++] = (p2 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1)); buckets[2 + v2] = d;

        sa_sint_t p3 = SA[i + 1]; d += (p3 < 0); p3 &= SAINT_MAX; sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] >= T[p3 - 1]);
        SA[buckets[v3]++] = (p3 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1)); buckets[2 + v3] = d;
    }

    /* Tail: remaining entries one at a time, without look-ahead. */
    for (j += 2 * prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = SA[i]; d += (p < 0); p &=
SAINT_MAX; sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]);
        SA[buckets[v]++] = (p - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
    }

    return d;
}

/*
 * Left-to-right scan over SA[omp_block_start .. omp_block_start + omp_block_size - 1]
 * for the 4k bucket layout; returns the updated counter d.
 *
 * induction_bucket is &buckets[2 * k] (indexed by symbol) and distinct_names is
 * &buckets[0] (indexed through BUCKETS_INDEX2).
 *
 * For each entry p = SA[i]:
 *   - SA[i] is first rewritten with its sign bit cleared;
 *   - if p > 0, SA[i] is then set to 0, d is increased by p >> (SUFFIX_GROUP_BIT - 1),
 *     the SUFFIX_GROUP_MARKER bit is stripped from p, and p - 1 is stored at
 *     SA[induction_bucket[T[p - 1]]++] with
 *       * the top bit (SAINT_BIT - 1) set when T[p - 2] < T[p - 1], and
 *       * the bit SUFFIX_GROUP_BIT - 1 set when distinct_names[v] differs from d,
 *         where v = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]);
 *     finally distinct_names[v] is set to d.
 * Entries with p <= 0 are only unmasked and otherwise left alone.
 */
static sa_sint_t libsais16x64_partial_sorting_scan_left_to_right_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[2 * (fast_sint_t)k];
    sa_sint_t * RESTRICT distinct_names   = &buckets[0 * (fast_sint_t)k];

    fast_sint_t i, j;
    /* Unrolled by two with three look-ahead stages (SA, then T, then the two bucket tables). */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2)
    {
        libsais16x64_prefetchw(&SA[i + 3 * prefetch_distance]);

        /* For non-positive future entries the look-ahead address falls back to &T[2], so Ts - 1 and Ts - 2 stay inside T. */
        sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 & ~SUFFIX_GROUP_MARKER : 2]; libsais16x64_prefetchr(Ts0 - 1); libsais16x64_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 & ~SUFFIX_GROUP_MARKER : 2]; libsais16x64_prefetchr(Ts1 - 1); libsais16x64_prefetchr(Ts1 - 2);
        /* Second look-ahead stage: only positive entries have a bucket to prefetch. */
        sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais16x64_prefetchw(&induction_bucket[Ts2]); libsais16x64_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); }
        sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais16x64_prefetchw(&induction_bucket[Ts3]); libsais16x64_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); }

        /* Clear the sign bit in place; positive entries are consumed (slot zeroed) and induced. */
        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX;
        if (p0 > 0)
        {
            SA[i + 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]);
            SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d;
        }

        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX;
        if (p1 > 0)
        {
            SA[i + 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); p1 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]);
            SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
        }
    }

    /* Tail: remaining entries one at a time, without look-ahead. */
    for (j += 2 * prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX;
        if (p > 0)
        {
            SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]);
            SA[induction_bucket[T[p - 1]]++] = (p - 1) | (sa_sint_t)((sa_uint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
        }
    }

    return d;
}

/*
 * Left-to-right scan over SA[omp_block_start .. omp_block_start + omp_block_size - 1]
 * for the 1k bucket layout (one counter per symbol, no distinct-name tracking).
 *
 * For each entry p = SA[i]: SA[i] is rewritten with its sign bit cleared; if p > 0,
 * SA[i] is set to 0 and p - 1 is stored at SA[induction_bucket[T[p - 1]]++] with the
 * top bit (SAINT_BIT - 1) set when T[p - 2] < T[p - 1].
 */
static void libsais16x64_partial_sorting_scan_left_to_right_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT
SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Unrolled by two with three look-ahead stages (SA, then T, then induction_bucket). */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2)
    {
        libsais16x64_prefetchw(&SA[i + 3 * prefetch_distance]);

        /* For non-positive future entries the look-ahead address falls back to &T[1], so Ts - 1 stays inside T. */
        sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 1]; libsais16x64_prefetchr(Ts0 - 1);
        sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ? s1 : 1]; libsais16x64_prefetchr(Ts1 - 1);
        sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais16x64_prefetchw(&induction_bucket[T[s2 - 1]]); libsais16x64_prefetchr(&T[s2] - 2); }
        sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais16x64_prefetchw(&induction_bucket[T[s3 - 1]]); libsais16x64_prefetchr(&T[s3] - 2); }

        /* Clear the sign bit in place; positive entries are consumed (slot zeroed) and induced. */
        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { SA[i + 0] = 0; SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)); }
        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { SA[i + 1] = 0; SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)); }
    }

    /* Tail: remaining entries one at a time, without look-ahead. */
    for (j += 2 * prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { SA[i] = 0; SA[induction_bucket[T[p - 1]]++] = (p - 1) | (sa_sint_t)((sa_uint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)); }
    }
}

#if defined(LIBSAIS_OPENMP)

/*
 * Copies a block of SA into the thread cache for the 6k layout, together with the
 * bucket index each entry maps to.
 *
 * For every i in [omp_block_start, omp_block_start + omp_block_size):
 *   cache[i].index  = SA[i]   (raw value, sign bit included)
 *   cache[i].symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]) with
 *                     p = SA[i] & SAINT_MAX, or 0 when p is 0.
 * SA itself is not modified. cache is indexed with the same index i as SA.
 */
static void libsais16x64_partial_sorting_scan_left_to_right_32s_6k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Unrolled by two; j leaves room for the look-ahead reads of SA. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais16x64_prefetchr(&SA[i + 2 *
prefetch_distance]);

        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);

        libsais16x64_prefetchw(&cache[i + prefetch_distance]);

        /* Entries that are 0 after masking keep symbol 0 and are never dereferenced through T. */
        sa_sint_t p0 = cache[i + 0].index = SA[i + 0]; sa_sint_t symbol0 = 0; p0 &= SAINT_MAX; if (p0 != 0) { symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); } cache[i + 0].symbol = symbol0;
        sa_sint_t p1 = cache[i + 1].index = SA[i + 1]; sa_sint_t symbol1 = 0; p1 &= SAINT_MAX; if (p1 != 0) { symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); } cache[i + 1].symbol = symbol1;
    }

    /* Tail: remaining entries one at a time, without look-ahead. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = cache[i].index = SA[i]; sa_sint_t symbol = 0; p &= SAINT_MAX; if (p != 0) { symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]); } cache[i].symbol = symbol;
    }
}

/*
 * Moves a block of SA into the thread cache for the 4k layout.
 *
 * For every i in [omp_block_start, omp_block_start + omp_block_size):
 *   - if SA[i] > 0: cache[i].index = SA[i] (marker bit kept),
 *     cache[i].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]) with p being
 *     SA[i] without SUFFIX_GROUP_MARKER, and SA[i] is set to 0;
 *   - otherwise: cache[i].symbol = SAINT_MIN, cache[i].index is left unwritten, and
 *     SA[i] is rewritten with its sign bit cleared.
 * cache is indexed with the same index i as SA.
 */
static void libsais16x64_partial_sorting_scan_left_to_right_32s_4k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Unrolled by two; j leaves room for the look-ahead reads of SA. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais16x64_prefetchw(&SA[i + 2 * prefetch_distance]);

        /* For non-positive future entries the look-ahead address falls back to &T[2], so Ts - 1 and Ts - 2 stay inside T. */
        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 & ~SUFFIX_GROUP_MARKER : 2]; libsais16x64_prefetchr(Ts0 - 1); libsais16x64_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 & ~SUFFIX_GROUP_MARKER : 2]; libsais16x64_prefetchr(Ts1 - 1); libsais16x64_prefetchr(Ts1 - 2);

        libsais16x64_prefetchw(&cache[i + prefetch_distance]);

        /* p is reset to 0 for consumed entries so that the final store writes 0 back to SA. */
        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { cache[i + 0].index = p0; p0 &= ~SUFFIX_GROUP_MARKER; symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]); p0 = 0; } cache[i + 0].symbol = symbol0; SA[i + 0] = p0 & SAINT_MAX;
        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { cache[i + 1].index = p1; p1 &= ~SUFFIX_GROUP_MARKER; symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]); p1 = 0; } cache[i + 1].symbol = symbol1; SA[i + 1] = p1 & SAINT_MAX;
    }

    /* Tail: remaining entries one at a time, without look-ahead. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { cache[i].index = p; p &= ~SUFFIX_GROUP_MARKER; symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]); p = 0; } cache[i].symbol = symbol; SA[i] = p & SAINT_MAX;
    }
}

/*
 * Moves a block of SA into the thread cache for the 1k layout.
 *
 * For every i in [omp_block_start, omp_block_start + omp_block_size):
 *   - if SA[i] > 0: cache[i].index = (p - 1) with the top bit (SAINT_BIT - 1) set
 *     when T[p - 2] < T[p - 1], cache[i].symbol = T[p - 1], and SA[i] is set to 0;
 *   - otherwise: cache[i].symbol = SAINT_MIN, cache[i].index is left unwritten, and
 *     SA[i] is rewritten with its sign bit cleared.
 * Unlike the 4k gather, the cached index is already the value to be induced.
 */
static void libsais16x64_partial_sorting_scan_left_to_right_32s_1k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Unrolled by two; j leaves room for the look-ahead reads of SA. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais16x64_prefetchw(&SA[i + 2 * prefetch_distance]);

        /* For non-positive future entries the look-ahead address falls back to &T[2], so Ts - 1 and Ts - 2 stay inside T. */
        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais16x64_prefetchr(Ts0 - 1); libsais16x64_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 : 2]; libsais16x64_prefetchr(Ts1 - 1); libsais16x64_prefetchr(Ts1 - 2);

        libsais16x64_prefetchw(&cache[i + prefetch_distance]);

        /* Positive entries are captured (index = p - 1 plus a top-bit flag, symbol = T[p - 1]) and zeroed in SA; others get the SAINT_MIN "skip" symbol. */
        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { cache[i + 0].index = (p0 - 1) | (sa_sint_t)((sa_uint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)); symbol0 = T[p0 - 1]; p0 = 0; } cache[i + 0].symbol = symbol0; SA[i + 0] = p0 & SAINT_MAX;
        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { cache[i + 1].index = (p1 - 1) | (sa_sint_t)((sa_uint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)); symbol1 = T[p1 - 1]; p1 = 0; } cache[i + 1].symbol = symbol1; SA[i + 1] = p1 & SAINT_MAX;
    }

    /* Scalar tail: same per-entry work for the last entries of the block, without look-ahead prefetching. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { cache[i].index = (p - 1) | (sa_sint_t)((sa_uint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)); symbol = T[p - 1]; p = 0; } cache[i].symbol = symbol; SA[i] = p & SAINT_MAX;
    }
}

/*
 * Sort step of the 6k left-to-right partial sorting scan. The 6k "block_omp"
 * driver runs it inside its `omp master` section over the whole block, after
 * the per-thread gather step has filled the cache.
 *
 * For each cache slot i in [omp_block_start, omp_block_start + omp_block_size):
 *   - d is incremented when the cached index is negative (top bit set);
 *   - cache[i].symbol is replaced by the slot obtained from buckets[v]++,
 *     where v is the symbol stored by the gather step;
 *   - cache[i].index becomes (p - 1) with the top bit set iff
 *     buckets[2 + v] != d, after which buckets[2 + v] is updated to d;
 *   - when the obtained slot is below omp_block_end, the new index is also
 *     copied into cache[slot] together with its BUCKETS_INDEX4 symbol
 *     (presumably so that this same pass processes it when it reaches that
 *     slot).
 *
 * Returns the updated value of d.
 */
static sa_sint_t libsais16x64_partial_sorting_scan_left_to_right_32s_6k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
    {
        libsais16x64_prefetchw(&cache[i + 2 * prefetch_distance]);

        /* Warm the bucket counters that the look-ahead entries are going to increment. */
        libsais16x64_prefetchw(&buckets[cache[i + prefetch_distance + 0].symbol]);
        libsais16x64_prefetchw(&buckets[cache[i + prefetch_distance + 1].symbol]);

        sa_sint_t v0 = cache[i + 0].symbol, p0 = cache[i + 0].index; d += (p0 < 0); cache[i + 0].symbol = buckets[v0]++; cache[i + 0].index = (p0 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1)); buckets[2 + v0] = d;
        if (cache[i + 0].symbol < omp_block_end) { sa_sint_t s = cache[i + 0].symbol, q = (cache[s].index = cache[i + 0].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 
1]); }

        sa_sint_t v1 = cache[i + 1].symbol, p1 = cache[i + 1].index; d += (p1 < 0); cache[i + 1].symbol = buckets[v1]++; cache[i + 1].index = (p1 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1)); buckets[2 + v1] = d;
        if (cache[i + 1].symbol < omp_block_end) { sa_sint_t s = cache[i + 1].symbol, q = (cache[s].index = cache[i + 1].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); }
    }

    /* Scalar tail: same per-entry work for the last entries of the block, without look-ahead prefetching. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t v = cache[i].symbol, p = cache[i].index; d += (p < 0); cache[i].symbol = buckets[v]++; cache[i].index = (p - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
        if (cache[i].symbol < omp_block_end) { sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); }
    }

    return d;
}

/*
 * Sort step of the 4k left-to-right partial sorting scan. The 4k "block_omp"
 * driver runs it inside its `omp master` section over the whole block, after
 * the per-thread gather step has filled the cache.
 *
 * buckets is used as two tables: distinct_names starts at buckets[0] and
 * induction_bucket at buckets[2 * k].
 *
 * Cache slots whose symbol is negative are skipped (the 4k gather step stores
 * SAINT_MIN for entries it did not capture). For every other slot i:
 *   - d advances by p >> (SUFFIX_GROUP_BIT - 1), p being the cached index
 *     (presumably 0 or 1, i.e. the entry's group-marker bit - confirm against
 *     the SUFFIX_GROUP_* definitions);
 *   - cache[i].symbol is replaced by the slot taken from
 *     induction_bucket[v >> 1]++;
 *   - cache[i].index becomes (p - 1) combined with v shifted into the top bit
 *     and, at bit (SUFFIX_GROUP_BIT - 1), the flag distinct_names[v] != d;
 *     distinct_names[v] is then set to d;
 *   - when the slot taken is below omp_block_end and the new index is
 *     positive, the index and its BUCKETS_INDEX2 symbol are forwarded into
 *     cache[slot] and cache[i].index is zeroed; a non-positive new index is
 *     kept in cache[i] with its top bit cleared.
 *
 * Returns the updated value of d.
 */
static sa_sint_t libsais16x64_partial_sorting_scan_left_to_right_32s_4k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[2 * (fast_sint_t)k];
    sa_sint_t * RESTRICT distinct_names   = &buckets[0 * (fast_sint_t)k];

    fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
    {
        libsais16x64_prefetchw(&cache[i + 2 * prefetch_distance]);

        /* Look-ahead symbols that are not positive prefetch element 0 of each table instead. */
        sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 > 0 ? s0 >> 1 : 0]; libsais16x64_prefetchw(Is0); const sa_sint_t * Ds0 = &distinct_names[s0 > 0 ? s0 : 0]; libsais16x64_prefetchw(Ds0);
        sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 > 0 ? s1 >> 1 : 0]; libsais16x64_prefetchw(Is1); const sa_sint_t * Ds1 = &distinct_names[s1 > 0 ? s1 : 0]; libsais16x64_prefetchw(Ds1);

        sa_sint_t v0 = cache[i + 0].symbol;
        if (v0 >= 0)
        {
            sa_sint_t p0 = cache[i + 0].index; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); cache[i + 0].symbol = induction_bucket[v0 >> 1]++; cache[i + 0].index = (p0 - 1) | (sa_sint_t)((sa_uint_t)v0 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d;
            if (cache[i + 0].symbol < omp_block_end)
            {
                /* The destination slot belongs to this block: forward the entry through the cache. */
                sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i + 0].index = np & SAINT_MAX;
            }
        }

        sa_sint_t v1 = cache[i + 1].symbol;
        if (v1 >= 0)
        {
            sa_sint_t p1 = cache[i + 1].index; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); cache[i + 1].symbol = induction_bucket[v1 >> 1]++; cache[i + 1].index = (p1 - 1) | (sa_sint_t)((sa_uint_t)v1 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
            if (cache[i + 1].symbol < omp_block_end)
            {
                sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i + 1].index = np & SAINT_MAX;
            }
        }
    }

    /* Scalar tail: same per-entry work for the last entries of the block, without look-ahead prefetching. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t v = cache[i].symbol;
        if (v >= 0)
        {
            sa_sint_t p = cache[i].index; d += (p >> (SUFFIX_GROUP_BIT - 1)); cache[i].symbol = induction_bucket[v >> 1]++; cache[i].index = (p - 1) | (sa_sint_t)((sa_uint_t)v << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
            if (cache[i].symbol < omp_block_end)
            {
                sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } 
cache[i].index = np & SAINT_MAX;
            }
        }
    }

    return d;
}

/*
 * Sort step of the 1k left-to-right partial sorting scan. The 1k "block_omp"
 * driver runs it inside its `omp master` section over the whole block, after
 * the per-thread gather step has filled the cache.
 *
 * Cache slots whose symbol is negative are skipped (the 1k gather step stores
 * SAINT_MIN for entries it did not capture). For every other slot i:
 *   - cache[i].symbol is replaced by the slot taken from
 *     induction_bucket[v]++, v being the cached symbol;
 *   - when that slot is below omp_block_end and the cached index np is
 *     positive, cache[slot] receives index (np - 1) with the top bit set iff
 *     T[np - 2] < T[np - 1] and symbol T[np - 1], and cache[i].index is
 *     zeroed; a non-positive index is kept in cache[i] with its top bit
 *     cleared;
 *   - when the slot is at or beyond omp_block_end, cache[i].index is left
 *     as it was.
 */
static void libsais16x64_partial_sorting_scan_left_to_right_32s_1k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
    {
        libsais16x64_prefetchw(&cache[i + 2 * prefetch_distance]);

        /* Look-ahead symbols that are not positive prefetch induction_bucket[0] instead. */
        sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 > 0 ? s0 : 0]; libsais16x64_prefetchw(Is0);
        sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 > 0 ? s1 : 0]; libsais16x64_prefetchw(Is1);

        sa_sint_t v0 = cache[i + 0].symbol;
        if (v0 >= 0)
        {
            cache[i + 0].symbol = induction_bucket[v0]++;
            if (cache[i + 0].symbol < omp_block_end)
            {
                /* The destination slot belongs to this block: forward the entry through the cache. */
                sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; if (np > 0) { cache[ni].index = (np - 1) | (sa_sint_t)((sa_uint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i + 0].index = np & SAINT_MAX;
            }
        }

        sa_sint_t v1 = cache[i + 1].symbol;
        if (v1 >= 0)
        {
            cache[i + 1].symbol = induction_bucket[v1]++;
            if (cache[i + 1].symbol < omp_block_end)
            {
                sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; if (np > 0) { cache[ni].index = (np - 1) | (sa_sint_t)((sa_uint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i + 1].index = np & SAINT_MAX;
            }
        }
    }

    /* Scalar tail: same per-entry work for the last entries of the block, without look-ahead prefetching. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t v = cache[i].symbol;
        if (v >= 0)
        {
            cache[i].symbol = induction_bucket[v]++;
            if (cache[i].symbol < omp_block_end)
            {
                sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[ni].index = (np - 1) | (sa_sint_t)((sa_uint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; 
np = 0; } cache[i].index = np & SAINT_MAX;
            }
        }
    }
}

/*
 * Processes one block [block_start, block_start + block_size) of the 6k
 * left-to-right partial sorting scan, optionally with several threads.
 *
 * The block is divided into per-thread sub-ranges whose stride is
 * block_size / omp_num_threads rounded down to a multiple of 16; the last
 * thread also takes the remainder.
 *
 *   - one thread:     the sequential 6k scan is run on the range directly;
 *   - several threads: every thread gathers its sub-range into the cache,
 *                      the master thread alone runs the sort step over the
 *                      whole block, and then every thread writes its
 *                      sub-range of cached results with
 *                      libsais16x64_place_cached_suffixes. Barriers separate
 *                      the three steps.
 *
 * `cache - block_start` rebases the cache pointer so the helpers can index it
 * with absolute SA positions. The parallel region is only requested when
 * threads > 1 and block_size >= 16384.
 *
 * Returns the updated value of d.
 */
static sa_sint_t libsais16x64_partial_sorting_scan_left_to_right_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num    = 0;
        fast_sint_t omp_num_threads   = 1;
#endif
        /* & (-16) rounds the per-thread stride down to a multiple of 16. */
        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            d = libsais16x64_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                libsais16x64_partial_sorting_scan_left_to_right_32s_6k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* Only the master thread runs the sort step (and assigns d), over the full block rather than a sub-range. */
            #pragma omp master
            {
                d = libsais16x64_partial_sorting_scan_left_to_right_32s_6k_block_sort(T, buckets, d, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            {
                libsais16x64_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }

    return d;
}

/*
 * Processes one block [block_start, block_start + block_size) of the 4k
 * left-to-right partial sorting scan, optionally with several threads.
 *
 * Same structure as the 6k block driver: per-thread gather, sort step on the
 * master thread over the whole block, then per-thread write-back, here via
 * libsais16x64_compact_and_place_cached_suffixes. With a single thread the
 * sequential 4k scan is called instead.
 *
 * Returns the updated value of d.
 */
static sa_sint_t libsais16x64_partial_sorting_scan_left_to_right_32s_4k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 
&& block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num    = 0;
        fast_sint_t omp_num_threads   = 1;
#endif
        /* Per-thread stride is rounded down to a multiple of 16; the last thread also takes the remainder. */
        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            d = libsais16x64_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /* `cache - block_start` lets the helpers index the cache with absolute SA positions. */
            {
                libsais16x64_partial_sorting_scan_left_to_right_32s_4k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* Only the master thread runs the sort step (and assigns d), over the full block rather than a sub-range. */
            #pragma omp master
            {
                d = libsais16x64_partial_sorting_scan_left_to_right_32s_4k_block_sort(T, k, buckets, d, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            {
                libsais16x64_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }

    return d;
}

/*
 * Processes one block [block_start, block_start + block_size) of the 1k
 * left-to-right partial sorting scan, optionally with several threads.
 *
 * The block is divided into per-thread sub-ranges (stride rounded down to a
 * multiple of 16, remainder to the last thread). With one thread the
 * sequential 1k scan is called; otherwise every thread gathers its sub-range
 * into the cache, the master thread runs the sort step over the whole block,
 * and every thread writes its sub-range back through
 * libsais16x64_compact_and_place_cached_suffixes, with barriers in between.
 *
 * The parallel region is only requested when threads > 1 and
 * block_size >= 16384.
 */
static void libsais16x64_partial_sorting_scan_left_to_right_32s_1k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num    = 0;
        fast_sint_t omp_num_threads   = 1;
#endif
        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size    
= omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            libsais16x64_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /* `cache - block_start` lets the helpers index the cache with absolute SA positions. */
            {
                libsais16x64_partial_sorting_scan_left_to_right_32s_1k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* Only the master thread runs the sort step, over the full block rather than a sub-range. */
            #pragma omp master
            {
                libsais16x64_partial_sorting_scan_left_to_right_32s_1k_block_sort(T, buckets, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            {
                libsais16x64_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }
}

#endif

/*
 * Entry point of the 6k left-to-right partial sorting scan.
 *
 * First seeds SA with the suffix starting at n - 1: it is written, with the
 * top bit set (| SAINT_MIN), to the slot taken from
 * buckets[BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])]++, and d is
 * pre-incremented and stored at buckets[2 + <the same index>].
 *
 * Then SA[0 .. left_suffixes_count) is scanned:
 *   - sequentially when threads == 1 or left_suffixes_count < 65536;
 *   - otherwise (OpenMP builds only) in consecutive blocks of at most
 *     threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries, each handed to the
 *     6k block driver together with thread_state[0].state.cache.
 *
 * NOTE(review): without LIBSAIS_OPENMP the else branch is compiled out, so
 * nothing is scanned unless the if-condition holds; presumably callers only
 * pass threads == 1 in that configuration - confirm.
 *
 * Returns the updated value of d.
 */
static sa_sint_t libsais16x64_partial_sorting_scan_left_to_right_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    SA[buckets[BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN;
    buckets[2 + BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])] = ++d;

    if (threads == 1 || left_suffixes_count < 65536)
    {
        d = libsais16x64_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, 0, left_suffixes_count);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start, block_end;
        for (block_start = 0; block_start < left_suffixes_count; block_start = block_end)
        {
            /* Block length is capped at threads * LIBSAIS_PER_THREAD_CACHE_SIZE and clipped to the scan range. */
            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > left_suffixes_count) { block_end = left_suffixes_count; }

            d = libsais16x64_partial_sorting_scan_left_to_right_32s_6k_block_omp(T, SA, buckets, d, thread_state[0].state.cache, block_start, block_end - block_start, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif

    return d;
}

/*
 * Entry point of the 4k left-to-right partial sorting scan: seeds SA with the
 * suffix starting at n - 1, then scans SA[0 .. n) either sequentially or, in
 * OpenMP builds with threads > 1 and n >= 65536, block by block through the
 * 4k block driver. Returns the updated value of d.
 */
static sa_sint_t libsais16x64_partial_sorting_scan_left_to_right_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, 
sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    /* buckets holds two tables here: distinct_names at offset 0 and induction_bucket at offset 2 * k. */
    sa_sint_t * RESTRICT induction_bucket = &buckets[2 * (fast_sint_t)k];
    sa_sint_t * RESTRICT distinct_names   = &buckets[0 * (fast_sint_t)k];

    /* Seed: suffix n - 1 goes to the next free slot of its symbol, tagged with the T[n - 2] < T[n - 1] flag in the top bit and with SUFFIX_GROUP_MARKER. */
    SA[induction_bucket[T[n - 1]]++] = (n - 1) | (sa_sint_t)((sa_uint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)) | SUFFIX_GROUP_MARKER;
    distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] < T[n - 1])] = ++d;

    if (threads == 1 || n < 65536)
    {
        d = libsais16x64_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, 0, n);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start, block_end;
        for (block_start = 0; block_start < n; block_start = block_end)
        {
            /* Block length is capped at threads * LIBSAIS_PER_THREAD_CACHE_SIZE and clipped to n. */
            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; }

            d = libsais16x64_partial_sorting_scan_left_to_right_32s_4k_block_omp(T, SA, k, buckets, d, thread_state[0].state.cache, block_start, block_end - block_start, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif

    return d;
}

/*
 * Entry point of the 1k left-to-right partial sorting scan.
 *
 * Seeds SA with the suffix starting at n - 1: it is written to the slot taken
 * from buckets[T[n - 1]]++, with the top bit set iff T[n - 2] < T[n - 1].
 *
 * Then SA[0 .. n) is scanned:
 *   - sequentially when threads == 1 or n < 65536;
 *   - otherwise (OpenMP builds only) in consecutive blocks of at most
 *     threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries, each handed to the
 *     1k block driver together with thread_state[0].state.cache.
 *
 * NOTE(review): without LIBSAIS_OPENMP the else branch is compiled out, so
 * nothing is scanned unless the if-condition holds; presumably callers only
 * pass threads == 1 in that configuration - confirm.
 */
static void libsais16x64_partial_sorting_scan_left_to_right_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    SA[buckets[T[n - 1]]++] = (n - 1) | (sa_sint_t)((sa_uint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));

    if (threads == 1 || n < 65536)
    {
        libsais16x64_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, 0, n);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start, block_end;
        for (block_start = 0; block_start < n; block_start = block_end)
        {
            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; }

            libsais16x64_partial_sorting_scan_left_to_right_32s_1k_block_omp(T, SA, buckets, thread_state[0].state.cache, block_start, block_end - block_start, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif
}

static void 
libsais16x64_partial_sorting_shift_markers_16u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, const sa_sint_t * RESTRICT buckets, sa_sint_t threads)
{
    /*
     * Shifts the top-bit (SAINT_MIN) markers of SA by one position towards
     * lower indices, independently inside each symbol's range.
     *
     * For every c from BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0) down to
     * BUCKETS_INDEX2(1, 0), the range from temp_bucket[c] - 1 down to
     * buckets[c - BUCKETS_INDEX2(1, 0)] (inclusive) is walked from high to
     * low index. `s` carries the marker of the entry visited just before
     * (initially SAINT_MIN, i.e. set); each entry receives `s` as its new top
     * bit and its own old top bit becomes the next `s`. All other bits are
     * left unchanged.
     *
     * In OpenMP builds the outer loop over c is a parallel-for with
     * schedule(static, 1), requested when threads > 1 and n >= 65536.
     */
    const fast_sint_t prefetch_distance = 64;

    const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];

    fast_sint_t c;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && n >= 65536)
#else
    UNUSED(threads); UNUSED(n);
#endif
    for (c = BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); c >= BUCKETS_INDEX2(1, 0); c -= BUCKETS_INDEX2(1, 0))
    {
        fast_sint_t i, j; sa_sint_t s = SAINT_MIN;
        /* Unrolled by four: q is the bit that has to be flipped so that the entry ends up with marker s. */
        for (i = (fast_sint_t)temp_bucket[c] - 1, j = (fast_sint_t)buckets[c - BUCKETS_INDEX2(1, 0)] + 3; i >= j; i -= 4)
        {
            libsais16x64_prefetchw(&SA[i - prefetch_distance]);

            sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s; s = s ^ q0; SA[i - 0] = p0 ^ q0;
            sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1;
            sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2;
            sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3;
        }

        /* j -= 3 restores the true lower bound of the range for the scalar tail. */
        for (j -= 3; i >= j; i -= 1)
        {
            sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q;
        }
    }
}

/*
 * 6k counterpart of the marker shift: for every symbol c from k - 1 down to 1
 * the range from buckets[BUCKETS_INDEX4(c, 0)] - 1 down to
 * temp_bucket[BUCKETS_INDEX2(c - 1, 0)] (inclusive) is walked from high to
 * low index, and each entry's top bit is replaced by the top bit of the entry
 * visited just before it (a set bit for the first entry of each range).
 * temp_bucket starts at buckets[4 * k].
 *
 * In OpenMP builds the loop over c is a parallel-for with schedule(static, 1),
 * requested when threads > 1 and k >= 65536.
 */
static void libsais16x64_partial_sorting_shift_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, const sa_sint_t * RESTRICT buckets, sa_sint_t threads)
{
    const fast_sint_t prefetch_distance = 64;

    const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k];

    fast_sint_t c;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && k >= 65536)
#else
    UNUSED(threads);
#endif
    for (c = (fast_sint_t)k - 1; c >= 1; c -= 1)
    {
        fast_sint_t i, j; sa_sint_t s = SAINT_MIN;
        for (i = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 0)] - 1, j = (fast_sint_t)temp_bucket[BUCKETS_INDEX2(c - 1, 0)] + 3; i >= j; i -= 4)
        {
            libsais16x64_prefetchw(&SA[i - prefetch_distance]);

            sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s; s = s ^ q0; SA[i - 
0] = p0 ^ q0;
            sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1;
            sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2;
            sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3;
        }

        /* j -= 3 restores the true lower bound of the range for the scalar tail. */
        for (j -= 3; i >= j; i -= 1)
        {
            sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q;
        }
    }
}

/*
 * 4k variant of the marker shift, working on the SUFFIX_GROUP_MARKER bit over
 * the whole array SA[0 .. n), walked from index n - 1 down to 0.
 *
 * Only entries with p > 0 take part: for them q is the marker bit XOR the
 * carried state `s`, the entry is stored with that bit flipped (so it ends up
 * with marker `s`), and `s` becomes the entry's previous marker. For entries
 * with p <= 0 the mask ((p > 0) << (SUFFIX_GROUP_BIT - 1)) is zero, so both
 * the entry and `s` stay unchanged. `s` starts as SUFFIX_GROUP_MARKER.
 *
 * NOTE(review): this relies on SUFFIX_GROUP_MARKER being the bit at position
 * SUFFIX_GROUP_BIT - 1; the definitions are outside this section - confirm.
 */
static void libsais16x64_partial_sorting_shift_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i; sa_sint_t s = SUFFIX_GROUP_MARKER;
    for (i = (fast_sint_t)n - 1; i >= 3; i -= 4)
    {
        libsais16x64_prefetchw(&SA[i - prefetch_distance]);

        sa_sint_t p0 = SA[i - 0], q0 = ((p0 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p0 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q0; SA[i - 0] = p0 ^ q0;
        sa_sint_t p1 = SA[i - 1], q1 = ((p1 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p1 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q1; SA[i - 1] = p1 ^ q1;
        sa_sint_t p2 = SA[i - 2], q2 = ((p2 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p2 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q2; SA[i - 2] = p2 ^ q2;
        sa_sint_t p3 = SA[i - 3], q3 = ((p3 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p3 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q3; SA[i - 3] = p3 ^ q3;
    }

    /* Scalar tail for the remaining lowest indices. */
    for (; i >= 0; i -= 1)
    {
        sa_sint_t p = SA[i], q = ((p & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q; SA[i] = p ^ q;
    }
}

/*
 * Copies, for every symbol 0 .. k - 1, the two temp_bucket values at
 * i + BUCKETS_INDEX2(0, 0) and i + BUCKETS_INDEX2(0, 1) into the main table at
 * 2 * i + BUCKETS_INDEX4(0, 0) and 2 * i + BUCKETS_INDEX4(0, 1), where i steps
 * through BUCKETS_INDEX2(symbol, 0). temp_bucket starts at buckets[4 * k].
 */
static void libsais16x64_partial_sorting_shift_buckets_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
{
    sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k];

    fast_sint_t i;
    for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0))
    {
        buckets[2 * i + BUCKETS_INDEX4(0, 0)] = temp_bucket[i + BUCKETS_INDEX2(0, 0)];
        buckets[2 * i + BUCKETS_INDEX4(0, 1)] = temp_bucket[i + BUCKETS_INDEX2(0, 1)];
    }
}

/*
 * Sequential right-to-left partial sorting scan for 16-bit input.
 *
 * Walks SA from omp_block_start + omp_block_size - 1 down to omp_block_start.
 * For each entry: d is incremented when the entry's top bit is set; with p the
 * entry without its top bit and v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]),
 * the value (p - 1), with the top bit set iff distinct_names[v] != d, is
 * written to SA[--induction_bucket[v]], and distinct_names[v] is set to d.
 *
 * Returns the updated value of d.
 */
static sa_sint_t libsais16x64_partial_sorting_scan_right_to_left_16u(const uint16_t * RESTRICT T, 
sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    /* buckets holds two tables here: induction_bucket at offset 0 and distinct_names at offset 2 * ALPHABET_SIZE. */
    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];

    fast_sint_t i, j;
    /* Main loop: two entries per iteration, walking down, with prefetches for the entries prefetch_distance below. */
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais16x64_prefetchr(&SA[i - 2 * prefetch_distance]);

        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1);
        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2);
        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1);
        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2);

        /* The top bit of the written entry records whether d changed since symbol v was last written. */
        sa_sint_t p0 = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
        SA[--induction_bucket[v0]] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d;

        sa_sint_t p1 = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
        SA[--induction_bucket[v1]] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
    }

    /* Scalar tail: j is moved back to omp_block_start so the remaining lowest entries are processed one at a time. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
        SA[--induction_bucket[v]] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
    }

    return d;
}

/*
 * Variant of the sequential right-to-left 16-bit scan used for the "gsa"
 * code path. It performs the same walk and the same d bookkeeping, but an
 * entry whose bucket index v equals 1 is not written to SA and does not
 * update distinct_names.
 *
 * NOTE(review): v == 1 is BUCKETS_INDEX2(0, 1) if the macro packs the flag in
 * the low bit, i.e. symbol 0 with the flag set - presumably the separator
 * symbol of a generalized suffix array; confirm against the macro definition.
 *
 * Returns the updated value of d.
 */
static sa_sint_t libsais16x64_partial_gsa_scan_right_to_left_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names   = &buckets[2 * 
ALPHABET_SIZE];

    fast_sint_t i, j;
    /* Main loop: two entries per iteration, walking down, with prefetches for the entries prefetch_distance below. */
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais16x64_prefetchr(&SA[i - 2 * prefetch_distance]);

        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1);
        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2);
        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1);
        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2);

        /* d is updated for every entry, but entries with v == 1 are neither written to SA nor recorded in distinct_names. */
        sa_sint_t p0 = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
        if (v0 != 1) { SA[--induction_bucket[v0]] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; }

        sa_sint_t p1 = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
        if (v1 != 1) { SA[--induction_bucket[v1]] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d; }
    }

    /* Scalar tail: j is moved back to omp_block_start so the remaining lowest entries are processed one at a time. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
        if (v != 1) { SA[--induction_bucket[v]] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; }
    }

    return d;
}

#if defined(LIBSAIS_OPENMP)

/*
 * Per-thread preparation step of the parallel right-to-left 16-bit scan.
 *
 * `buckets` and `cache` are the calling thread's private tables (the block
 * drivers pass thread_state[omp_thread_num].state.buckets / .cache). The
 * first 2 * k entries of both private tables are zeroed, then the thread's
 * sub-range of SA is walked from high to low index and, for each entry:
 *   - the raw entry and its bucket index
 *     v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]) are appended to the
 *     cache (cache[count++]);
 *   - induction_bucket[v] counts the occurrences of v;
 *   - a local d, starting at 1 and incremented for every entry whose top bit
 *     is set, is recorded in distinct_names[v].
 *
 * On return state[0].state.position holds d - 1 (the number of top-bit
 * entries seen) and state[0].state.count the number of cached entries.
 * SA itself is only read here.
 */
static void libsais16x64_partial_sorting_scan_right_to_left_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size, LIBSAIS_THREAD_STATE * RESTRICT state)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];

    memset(induction_bucket, 0, (size_t)2 * (size_t)k * sizeof(sa_sint_t));
    memset(distinct_names , 0, (size_t)2 
* (size_t)k * sizeof(sa_sint_t));

    /* d is local to this sub-range and starts at 1, so a zero in distinct_names means "symbol not seen". */
    fast_sint_t i, j, count = 0; sa_sint_t d = 1;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais16x64_prefetchr(&SA[i - 2 * prefetch_distance]);

        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1);
        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2);
        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1);
        libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2);

        /* Record the raw entry and its bucket index in the cache; count the symbol and remember the local d at its last occurrence. */
        sa_sint_t p0 = cache[count].index = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); induction_bucket[v0]++; distinct_names[v0] = d;
        sa_sint_t p1 = cache[count].index = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); induction_bucket[v1]++; distinct_names[v1] = d;
    }

    /* Scalar tail: j is moved back to omp_block_start so the remaining lowest entries are processed one at a time. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = cache[count].index = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); induction_bucket[v]++; distinct_names[v] = d;
    }

    /* Publish the number of top-bit entries seen (d started at 1) and the number of cached entries. */
    state[0].state.position = (fast_sint_t)d - 1;
    state[0].state.count    = count;
}

/*
 * Per-thread placement step of the parallel right-to-left 16-bit scan.
 *
 * Replays the `count` entries recorded in the cache, in the order they were
 * cached, using the given bucket tables and starting value d. For each entry:
 * d is incremented when the cached index has its top bit set, and (p - 1),
 * with the top bit set iff distinct_names[v] != d, is written to
 * SA[--induction_bucket[v]]; distinct_names[v] is then set to d.
 * The cached index is used as stored (its top bit is not masked off before
 * the subtraction).
 */
static void libsais16x64_partial_sorting_scan_right_to_left_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];

    fast_sint_t i, j;
    for (i = 0, j = count - 1; i < j; i += 2)
    {
        libsais16x64_prefetchr(&cache[i + prefetch_distance]);

        sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol;
        SA[--induction_bucket[v0]] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v0] != d) << 
(SAINT_BIT - 1)); distinct_names[v0] = d;

        sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol;
        SA[--induction_bucket[v1]] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
    }

    /* Handles the last entry when count is odd. */
    for (j += 1; i < j; i += 1)
    {
        sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol;
        SA[--induction_bucket[v]] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
    }
}

/*
 * "gsa" variant of the per-thread placement step: replays the `count` cached
 * entries exactly like the plain placement routine, except that an entry
 * whose cached bucket index equals 1 is not written to SA and does not update
 * distinct_names (d is still advanced for it).
 */
static void libsais16x64_partial_gsa_scan_right_to_left_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];

    fast_sint_t i, j;
    for (i = 0, j = count - 1; i < j; i += 2)
    {
        libsais16x64_prefetchr(&cache[i + prefetch_distance]);

        sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol;
        if (v0 != 1) { SA[--induction_bucket[v0]] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; }

        sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol;
        if (v1 != 1) { SA[--induction_bucket[v1]] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d; }
    }

    /* Handles the last entry when count is odd. */
    for (j += 1; i < j; i += 1)
    {
        sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol;
        if (v != 1) { SA[--induction_bucket[v]] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; }
    }
}

/*
 * Processes one block [block_start, block_start + block_size) of the
 * right-to-left 16-bit partial sorting scan, optionally with several threads.
 *
 * The block is divided into per-thread sub-ranges (stride rounded down to a
 * multiple of 16, remainder to the last thread).
 *   - one thread:     the sequential right-to-left scan is called directly;
 *   - several threads:
 *       1. every thread runs the prepare step on its sub-range, filling its
 *          private buckets and cache in thread_state[omp_thread_num];
 *       2. after a barrier, the master thread walks the threads from last to
 *          first and turns the private tables into per-thread starting
 *          states while advancing the shared tables and d;
 *       3. after another barrier, every thread replays its cache into SA with
 *          the placement step, using its private tables and starting d.
 *
 * The parallel region is only requested when threads > 1,
 * block_size >= 64 * max(k, 256) and omp_get_dynamic() == 0.
 *
 * Returns the updated value of d.
 */
static sa_sint_t libsais16x64_partial_sorting_scan_right_to_left_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ? k : 256) && omp_get_dynamic() == 0)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(k); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num    = 0;
        fast_sint_t omp_num_threads   = 1;
#endif
        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            d = libsais16x64_partial_sorting_scan_right_to_left_16u(T, SA, buckets, d, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                libsais16x64_partial_sorting_scan_right_to_left_16u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]);
            }

            #pragma omp barrier

            #pragma omp master
            {
                sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
                sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];

                /* Threads are merged from last to first, i.e. in the order a sequential right-to-left scan would visit their sub-ranges. */
                fast_sint_t t;
                for (t = omp_num_threads - 1; t >= 0; --t)
                {
                    sa_sint_t * RESTRICT temp_induction_bucket    = &thread_state[t].state.buckets[0 * ALPHABET_SIZE];
                    sa_sint_t * RESTRICT temp_distinct_names      = &thread_state[t].state.buckets[2 * ALPHABET_SIZE];

                    /* The shared cursor moves down by the thread's count; the thread's private table receives the cursor value from before the move. */
                    fast_sint_t c;
                    for (c = 0; c < 2 * k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A - B; temp_induction_bucket[c] = A; }

                    /* Symbols the thread saw (B > 0) take the thread-local name rebased by d - 1; the private table receives the previous shared value. */
                    for (d -= 1, c = 0; c < 2 * k; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ? 
D : A; temp_distinct_names[c] = A; } d += 1 + (sa_sint_t)thread_state[t].state.position; thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position; } } #pragma omp barrier { libsais16x64_partial_sorting_scan_right_to_left_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position); } } #endif } return d; } static sa_sint_t libsais16x64_partial_gsa_scan_right_to_left_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ? k : 256) && omp_get_dynamic() == 0) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(k); UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { d = libsais16x64_partial_gsa_scan_right_to_left_16u(T, SA, buckets, d, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { libsais16x64_partial_sorting_scan_right_to_left_16u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]); } #pragma omp barrier #pragma omp master { sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; fast_sint_t t; for (t = omp_num_threads - 1; t >= 0; --t) { sa_sint_t * RESTRICT temp_induction_bucket = &thread_state[t].state.buckets[0 * ALPHABET_SIZE]; sa_sint_t * RESTRICT temp_distinct_names = &thread_state[t].state.buckets[2 * ALPHABET_SIZE]; fast_sint_t c; for (c = 0; c < 2 * k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A - B; temp_induction_bucket[c] = A; } for (d -= 1, c = 0; c < 2 * k; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ? 
D : A; temp_distinct_names[c] = A; }

                    /* Advance d by the thread's stored position (undoing the -1 above) and
                       replace that position with d minus its old value. */
                    d += 1 + (sa_sint_t)thread_state[t].state.position; thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position;
                }
            }

            #pragma omp barrier

            {
                /* Phase 3 (every thread): write this thread's cached entries into SA
                   using the merged buckets. The helper is defined outside this view. */
                libsais16x64_partial_gsa_scan_right_to_left_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position);
            }
        }
#endif
    }

    return d;
}

#endif

/*
 * Driver for the right-to-left partial-sorting scan on 16-bit input.
 *
 * The scanned range is SA[scan_start, scan_end) with
 * scan_start = left_suffixes_count + 1 and scan_end = n - first_lms_suffix.
 * With one thread, or fewer than 65536 entries, the sequential scan handles
 * the whole range. Otherwise (OpenMP builds only) the range is walked from
 * its high end: zero entries are skipped, and each run of non-zero entries is
 * cut into blocks of at most
 * threads * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads) entries. Blocks
 * shorter than 32 are induced inline; longer ones go to the block_omp helper.
 *
 * d is received by value and the function returns void, so the final counter
 * is not reported to the caller.
 */
static void libsais16x64_partial_sorting_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
    fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;

    if (threads == 1 || (scan_end - scan_start) < 65536)
    {
        libsais16x64_partial_sorting_scan_right_to_left_16u(T, SA, buckets, d, scan_start, scan_end - scan_start);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
        sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

        fast_sint_t block_start;
        for (block_start = scan_end - 1; block_start >= scan_start; )
        {
            if (SA[block_start] == 0)
            {
                /* Empty slot: nothing to induce from it. */
                block_start--;
            }
            else
            {
                /* Extend the block downwards until a zero entry or the size cap is hit;
                   block_end is exclusive (one below the lowest entry in the block). */
                fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < scan_start) { block_max_end = scan_start - 1; }
                fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
                fast_sint_t block_size = block_start - block_end;

                if (block_size < 32)
                {
                    for (; block_start > block_end; block_start -= 1)
                    {
                        /* A set sign bit bumps d; the bit is then stripped to get position p.
                           The target SA slot receives p - 1, tagged in its top bit when the
                           bucket's recorded name differs from d. */
                        sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
                        SA[--induction_bucket[v]] = (p - 1) |
(sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
                    }
                }
                else
                {
                    /* Large block: process it with the multi-threaded helper, then continue
                       below the block. */
                    d = libsais16x64_partial_sorting_scan_right_to_left_16u_block_omp(T, SA, k, buckets, d, block_end + 1, block_size, threads, thread_state);
                    block_start = block_end;
                }
            }
        }
    }
#else
    UNUSED(k); UNUSED(thread_state);
#endif
}

/*
 * GSA counterpart of the 16-bit right-to-left partial-sorting driver.
 *
 * Same range computation and blocking policy as the non-GSA driver:
 * sequential scan for threads == 1 or fewer than 65536 entries, otherwise
 * runs of non-zero SA entries are cut into blocks, with blocks shorter than
 * 32 induced inline and longer ones passed to the GSA block_omp helper.
 * The inline path differs only in skipping entries whose bucket index v is 1.
 *
 * d is received by value and the function returns void, so the final counter
 * is not reported to the caller.
 */
static void libsais16x64_partial_gsa_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
    fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;

    if (threads == 1 || (scan_end - scan_start) < 65536)
    {
        libsais16x64_partial_gsa_scan_right_to_left_16u(T, SA, buckets, d, scan_start, scan_end - scan_start);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
        sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

        fast_sint_t block_start;
        for (block_start = scan_end - 1; block_start >= scan_start; )
        {
            if (SA[block_start] == 0)
            {
                block_start--;
            }
            else
            {
                /* Extend the block downwards until a zero entry or the size cap is hit. */
                fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < scan_start) { block_max_end = scan_start - 1; }
                fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
                fast_sint_t block_size = block_start - block_end;

                if (block_size < 32)
                {
                    for (; block_start > block_end; block_start -= 1)
                    {
                        sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);

                        /* NOTE(review): v == 1 presumably equals BUCKETS_INDEX2(0, 1), i.e. the
                           separator-symbol bucket - confirm against the macro definition. */
                        if (v != 1)
                        {
                            SA[--induction_bucket[v]] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
                        }
                    }
                }
                else
                {
                    d =
libsais16x64_partial_gsa_scan_right_to_left_16u_block_omp(T, SA, k, buckets, d, block_end + 1, block_size, threads, thread_state);
                    block_start = block_end;
                }
            }
        }
    }
#else
    UNUSED(k); UNUSED(thread_state);
#endif
}

/*
 * Sequential right-to-left partial-sorting scan for integer (32s) input using
 * the "6k" bucket layout, over
 * SA[omp_block_start, omp_block_start + omp_block_size).
 *
 * For each entry, a set sign bit bumps d and is stripped to obtain position p.
 * With v = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]), buckets[v] is
 * pre-decremented to get the destination SA slot, which receives p - 1 with
 * its top bit set when buckets[2 + v] differs from d; buckets[2 + v] is then
 * set to d.
 *
 * The main loop handles two entries per iteration and prefetches SA, T and
 * bucket cells ahead of use; the tail loop finishes the last
 * 2 * prefetch_distance + 1 positions without prefetching.
 *
 * Returns the updated d.
 */
static sa_sint_t libsais16x64_partial_sorting_scan_right_to_left_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2)
    {
        /* Three prefetch stages: SA at 3x distance, T at 2x, buckets at 1x. */
        libsais16x64_prefetchr(&SA[i - 3 * prefetch_distance]);

        libsais16x64_prefetchr(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 1);
        libsais16x64_prefetchr(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 2);
        libsais16x64_prefetchr(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 1);
        libsais16x64_prefetchr(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 2);

        /* "p - (p > 0)" keeps the index at 0 when p is 0, avoiding a T[-1] read. */
        sa_sint_t p0 = SA[i - prefetch_distance - 0] & SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); libsais16x64_prefetchw(&buckets[v0]);
        sa_sint_t p1 = SA[i - prefetch_distance - 1] & SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0); libsais16x64_prefetchw(&buckets[v1]);

        sa_sint_t p2 = SA[i - 0]; d += (p2 < 0); p2 &= SAINT_MAX; sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] > T[p2 - 1]);
        SA[--buckets[v2]] = (p2 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1)); buckets[2 + v2] = d;

        sa_sint_t p3 = SA[i - 1]; d += (p3 < 0); p3 &= SAINT_MAX; sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] > T[p3 - 1]);
        SA[--buckets[v3]] = (p3 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1)); buckets[2 + v3] = d;
    }

    /* Tail: remaining entries down to omp_block_start, one at a time. */
    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]);
        SA[--buckets[v]] = (p - 1) | (sa_sint_t)((sa_uint_t)(buckets[2
+ v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
    }

    return d;
}

/*
 * Sequential right-to-left partial-sorting scan for integer (32s) input using
 * the "4k" bucket layout, over
 * SA[omp_block_start, omp_block_start + omp_block_size).
 *
 * Induction buckets live at buckets[3 * k ...] and distinct names at
 * buckets[0 ...]. Only strictly positive SA entries are processed. Each one
 * is cleared to 0, contributes p >> (SUFFIX_GROUP_BIT - 1) to d, has
 * SUFFIX_GROUP_MARKER stripped, and then p - 1 is written to
 * SA[--induction_bucket[T[p - 1]]] with two tags OR-ed in:
 *   - top bit (SAINT_BIT - 1): T[p - 2] > T[p - 1];
 *   - bit SUFFIX_GROUP_BIT - 1: distinct_names[v] differs from d.
 *
 * Returns the updated d.
 */
static sa_sint_t libsais16x64_partial_sorting_scan_right_to_left_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[3 * (fast_sint_t)k];
    sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k];

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais16x64_prefetchw(&SA[i - 3 * prefetch_distance]);

        /* Non-positive entries fall back to index 2 so Ts - 1 and Ts - 2 stay inside T. */
        sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 & ~SUFFIX_GROUP_MARKER : 2]; libsais16x64_prefetchr(Ts0 - 1); libsais16x64_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ? s1 & ~SUFFIX_GROUP_MARKER : 2]; libsais16x64_prefetchr(Ts1 - 1); libsais16x64_prefetchr(Ts1 - 2);
        sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais16x64_prefetchw(&induction_bucket[Ts2]); libsais16x64_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); }
        sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais16x64_prefetchw(&induction_bucket[Ts3]); libsais16x64_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); }

        sa_sint_t p0 = SA[i - 0];
        if (p0 > 0)
        {
            SA[i - 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
            SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d;
        }

        sa_sint_t p1 = SA[i - 1];
        if (p1 > 0)
        {
            SA[i - 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT
- 1)); p1 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
            SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
        }
    }

    /* Tail: remaining entries down to omp_block_start, one at a time, no prefetching. */
    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i];
        if (p > 0)
        {
            SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
            SA[--induction_bucket[T[p - 1]]] = (p - 1) | (sa_sint_t)((sa_uint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
        }
    }

    return d;
}

/*
 * Sequential right-to-left partial-sorting scan for integer (32s) input using
 * a single induction bucket per symbol ("1k" layout), over
 * SA[omp_block_start, omp_block_start + omp_block_size).
 *
 * Each strictly positive entry p is cleared to 0 and p - 1 is written to
 * SA[--induction_bucket[T[p - 1]]], with the top bit set when
 * T[p - 2] > T[p - 1]. No distinct-name counter is maintained here.
 */
static void libsais16x64_partial_sorting_scan_right_to_left_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais16x64_prefetchw(&SA[i - 3 * prefetch_distance]);

        /* Non-positive entries fall back to index 1 so Ts - 1 stays inside T. */
        sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 1]; libsais16x64_prefetchr(Ts0 - 1);
        sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 : 1]; libsais16x64_prefetchr(Ts1 - 1);
        sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais16x64_prefetchw(&induction_bucket[T[s2 - 1]]); libsais16x64_prefetchr(&T[s2] - 2); }
        sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais16x64_prefetchw(&induction_bucket[T[s3 - 1]]); libsais16x64_prefetchr(&T[s3] - 2); }

        sa_sint_t p0 = SA[i - 0]; if (p0 > 0) { SA[i - 0] = 0; SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)); }
        sa_sint_t p1 = SA[i - 1]; if (p1 > 0) { SA[i - 1] = 0; SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)); }
    }

    /* Tail: remaining entries down to omp_block_start, one at a time, no prefetching. */
    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; if (p > 0) { SA[i] = 0; SA[--induction_bucket[T[p - 1]]] = (p - 1) | (sa_sint_t)((sa_uint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)); }
    }
}

#if defined(LIBSAIS_OPENMP)

/*
 * Gather step of the blocked 6k right-to-left scan: snapshots
 * SA[omp_block_start, omp_block_start + omp_block_size) into cache.
 *
 * cache is indexed with the same positions as SA. For every entry,
 * cache[i].index receives the raw SA value (sign bit included) and
 * cache[i].symbol receives BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]) for
 * the sign-stripped position p, or 0 when p is 0. SA itself is only read.
 */
static void libsais16x64_partial_sorting_scan_right_to_left_32s_6k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais16x64_prefetchr(&SA[i + 2 * prefetch_distance]);

        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
        libsais16x64_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);

        libsais16x64_prefetchw(&cache[i + prefetch_distance]);

        sa_sint_t p0 = cache[i + 0].index = SA[i + 0]; sa_sint_t symbol0 = 0; p0 &= SAINT_MAX; if (p0 != 0) { symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); } cache[i + 0].symbol = symbol0;
        sa_sint_t p1 = cache[i + 1].index = SA[i + 1];
sa_sint_t symbol1 = 0; p1 &= SAINT_MAX; if (p1 != 0) { symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); } cache[i + 1].symbol = symbol1;
    }

    /* Tail: remaining entries up to the end of the slice, one at a time. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = cache[i].index = SA[i]; sa_sint_t symbol = 0; p &= SAINT_MAX; if (p != 0) { symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]); } cache[i].symbol = symbol;
    }
}

/*
 * Gather step of the blocked 4k right-to-left scan: moves the live entries of
 * SA[omp_block_start, omp_block_start + omp_block_size) into cache.
 *
 * For each strictly positive SA[i], the slot is cleared to 0, the raw value
 * goes to cache[i].index and cache[i].symbol becomes
 * BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]) for the marker-stripped
 * position p. All other entries get cache[i].symbol = SAINT_MIN and their
 * cache[i].index is left untouched.
 */
static void libsais16x64_partial_sorting_scan_right_to_left_32s_4k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais16x64_prefetchw(&SA[i + 2 * prefetch_distance]);

        /* Non-positive entries fall back to index 2 so Ts - 1 and Ts - 2 stay inside T. */
        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 & ~SUFFIX_GROUP_MARKER : 2]; libsais16x64_prefetchr(Ts0 - 1); libsais16x64_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 & ~SUFFIX_GROUP_MARKER : 2]; libsais16x64_prefetchr(Ts1 - 1); libsais16x64_prefetchr(Ts1 - 2);

        libsais16x64_prefetchw(&cache[i + prefetch_distance]);

        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { SA[i + 0] = 0; cache[i + 0].index = p0; p0 &= ~SUFFIX_GROUP_MARKER; symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); } cache[i + 0].symbol = symbol0;
        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { SA[i + 1] = 0; cache[i + 1].index = p1; p1 &= ~SUFFIX_GROUP_MARKER; symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); } cache[i + 1].symbol = symbol1;
    }

    /* Tail: remaining entries up to the end of the slice, one at a time. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { SA[i] = 0; cache[i].index = p; p &= ~SUFFIX_GROUP_MARKER; symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); } cache[i].symbol = symbol;
    }
}

/*
 * Gather step of the blocked 1k right-to-left scan: moves the live entries of
 * SA[omp_block_start, omp_block_start + omp_block_size) into cache.
 *
 * For each strictly positive SA[i] = p, the slot is cleared to 0,
 * cache[i].index becomes p - 1 with the top bit set when T[p - 2] > T[p - 1],
 * and cache[i].symbol becomes T[p - 1]. All other entries get
 * cache[i].symbol = SAINT_MIN and their cache[i].index is left untouched.
 */
static void libsais16x64_partial_sorting_scan_right_to_left_32s_1k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais16x64_prefetchw(&SA[i + 2 * prefetch_distance]);

        /* Non-positive entries fall back to index 2 so Ts - 1 and Ts - 2 stay inside T. */
        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais16x64_prefetchr(Ts0 - 1); libsais16x64_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 : 2]; libsais16x64_prefetchr(Ts1 - 1); libsais16x64_prefetchr(Ts1 - 2);

        libsais16x64_prefetchw(&cache[i + prefetch_distance]);

        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { SA[i + 0] = 0; cache[i + 0].index = (p0 - 1) | (sa_sint_t)((sa_uint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)); symbol0 = T[p0 - 1]; } cache[i + 0].symbol = symbol0;
        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { SA[i + 1] = 0; cache[i + 1].index = (p1 - 1) | (sa_sint_t)((sa_uint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)); symbol1 = T[p1 - 1]; } cache[i + 1].symbol = symbol1;
    }

    /* Tail: remaining entries up to the end of the slice, one at a time. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { SA[i] = 0; cache[i].index = (p - 1) | (sa_sint_t)((sa_uint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)); symbol = T[p - 1]; } cache[i].symbol = symbol;
    }
}

/*
 * Sort step of the blocked 6k right-to-left scan, run over cached entries
 * instead of SA.
 *
 * Walking the cache from the high end of the block to the low end, each entry
 * (symbol v, index p) is turned into a placement record: cache[i].symbol
 * becomes the destination position (--buckets[v]) and cache[i].index becomes
 * p - 1, with the top bit set when buckets[2 + v] differs from d. A set sign
 * bit on p bumps d first.
 *
 * When the destination is at or above omp_block_start, the new value is also
 * copied into cache[destination] along with a freshly computed symbol.
 * NOTE(review): this presumably lets the same pass induce from entries it has
 * just produced (destination below the current index) - confirm.
 *
 * Returns the updated d.
 */
static sa_sint_t libsais16x64_partial_sorting_scan_right_to_left_32s_6k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais16x64_prefetchw(&cache[i - 2 * prefetch_distance]);

        libsais16x64_prefetchw(&buckets[cache[i - prefetch_distance - 0].symbol]);
        libsais16x64_prefetchw(&buckets[cache[i - prefetch_distance - 1].symbol]);

        sa_sint_t v0 = cache[i - 0].symbol, p0 = cache[i - 0].index; d += (p0 < 0); cache[i - 0].symbol = --buckets[v0]; cache[i - 0].index = (p0 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1)); buckets[2 + v0] = d;
        if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t s = cache[i - 0].symbol, q = (cache[s].index = cache[i - 0].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); }

        sa_sint_t v1 = cache[i - 1].symbol, p1 = cache[i - 1].index; d += (p1 < 0);
cache[i - 1].symbol = --buckets[v1]; cache[i - 1].index = (p1 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1)); buckets[2 + v1] = d;
        if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t s = cache[i - 1].symbol, q = (cache[s].index = cache[i - 1].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); }
    }

    /* Tail: remaining cache entries down to omp_block_start, one at a time. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t v = cache[i].symbol, p = cache[i].index; d += (p < 0); cache[i].symbol = --buckets[v]; cache[i].index = (p - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
        if (cache[i].symbol >= omp_block_start) { sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); }
    }

    return d;
}

/*
 * Sort step of the blocked 4k right-to-left scan, run over cached entries
 * instead of SA.
 *
 * Induction buckets live at buckets[3 * k ...] and distinct names at
 * buckets[0 ...]. Entries with a negative symbol are skipped. For the others,
 * cache[i].symbol becomes the destination --induction_bucket[v >> 1] and
 * cache[i].index becomes p - 1 with the low bit of v moved into the top bit
 * and the "name differs from d" flag placed at bit SUFFIX_GROUP_BIT - 1.
 *
 * When the destination is at or above omp_block_start and the new index is
 * positive, the record is moved to cache[destination] (the source index is
 * zeroed) with a freshly computed symbol.
 *
 * Returns the updated d.
 */
static sa_sint_t libsais16x64_partial_sorting_scan_right_to_left_32s_4k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[3 * (fast_sint_t)k];
    sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k];

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais16x64_prefetchw(&cache[i - 2 * prefetch_distance]);

        /* Non-positive symbols fall back to bucket 0 for the prefetch. */
        sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 > 0 ? s0 >> 1 : 0]; libsais16x64_prefetchw(Is0); const sa_sint_t * Ds0 = &distinct_names[s0 > 0 ? s0 : 0]; libsais16x64_prefetchw(Ds0);
        sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 > 0 ? s1 >> 1 : 0]; libsais16x64_prefetchw(Is1); const sa_sint_t * Ds1 = &distinct_names[s1 > 0 ?
s1 : 0]; libsais16x64_prefetchw(Ds1);

        sa_sint_t v0 = cache[i - 0].symbol;
        if (v0 >= 0)
        {
            sa_sint_t p0 = cache[i - 0].index; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); cache[i - 0].symbol = --induction_bucket[v0 >> 1]; cache[i - 0].index = (p0 - 1) | (sa_sint_t)((sa_uint_t)v0 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d;

            /* Destination inside this block's cache window: hand the record over. */
            if (cache[i - 0].symbol >= omp_block_start)
            {
                sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; if (np > 0) { cache[i - 0].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); }
            }
        }

        sa_sint_t v1 = cache[i - 1].symbol;
        if (v1 >= 0)
        {
            sa_sint_t p1 = cache[i - 1].index; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); cache[i - 1].symbol = --induction_bucket[v1 >> 1]; cache[i - 1].index = (p1 - 1) | (sa_sint_t)((sa_uint_t)v1 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
            if (cache[i - 1].symbol >= omp_block_start)
            {
                sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; if (np > 0) { cache[i - 1].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); }
            }
        }
    }

    /* Tail: remaining cache entries down to omp_block_start, one at a time. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t v = cache[i].symbol;
        if (v >= 0)
        {
            sa_sint_t p = cache[i].index; d += (p >> (SUFFIX_GROUP_BIT - 1)); cache[i].symbol = --induction_bucket[v >> 1]; cache[i].index = (p - 1) | (sa_sint_t)((sa_uint_t)v << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
            if (cache[i].symbol >= omp_block_start)
            {
                sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[i].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); }
            }
        }
    }

    return d;
}

/*
 * Sort step of the blocked 1k right-to-left scan, run over cached entries
 * instead of SA.
 *
 * Entries with a negative symbol are skipped. For the others,
 * cache[i].symbol becomes the destination --induction_bucket[v]. When that
 * destination is at or above omp_block_start and the entry's index np is
 * positive, the source index is zeroed and cache[destination] receives
 * np - 1 (top bit set when T[np - 2] > T[np - 1]) with symbol T[np - 1].
 */
static void libsais16x64_partial_sorting_scan_right_to_left_32s_1k_block_sort(const sa_sint_t
* RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais16x64_prefetchw(&cache[i - 2 * prefetch_distance]);

        /* Non-positive symbols fall back to bucket 0 for the prefetch. */
        sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 > 0 ? s0 : 0]; libsais16x64_prefetchw(Is0);
        sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 > 0 ? s1 : 0]; libsais16x64_prefetchw(Is1);

        sa_sint_t v0 = cache[i - 0].symbol;
        if (v0 >= 0)
        {
            cache[i - 0].symbol = --induction_bucket[v0];

            /* Destination inside this block's cache window: hand the record over. */
            if (cache[i - 0].symbol >= omp_block_start)
            {
                sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; if (np > 0) { cache[i - 0].index = 0; cache[ni].index = (np - 1) | (sa_sint_t)((sa_uint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; }
            }
        }

        sa_sint_t v1 = cache[i - 1].symbol;
        if (v1 >= 0)
        {
            cache[i - 1].symbol = --induction_bucket[v1];
            if (cache[i - 1].symbol >= omp_block_start)
            {
                sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; if (np > 0) { cache[i - 1].index = 0; cache[ni].index = (np - 1) | (sa_sint_t)((sa_uint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; }}
        }
    }

    /* Tail: remaining cache entries down to omp_block_start, one at a time. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t v = cache[i].symbol;
        if (v >= 0)
        {
            cache[i].symbol = --induction_bucket[v];
            if (cache[i].symbol >= omp_block_start)
            {
                sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[i].index = 0; cache[ni].index = (np - 1) | (sa_sint_t)((sa_uint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; }
            }
        }
    }
}

/*
 * Processes one block of SA, SA[block_start, block_start + block_size), for
 * the 6k right-to-left scan and returns the updated d.
 *
 * A parallel region is opened when threads > 1 and block_size >= 16384. With
 * a single active thread the sequential 6k scan is used. Otherwise each
 * thread gathers its slice into cache, the master thread sorts the whole
 * block in the cache, and each thread then places its slice back into SA.
 * cache is passed as (cache - block_start) so that helpers can index it with
 * absolute SA positions.
 */
static sa_sint_t libsais16x64_partial_sorting_scan_right_to_left_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets,
sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Even split with the stride rounded down to a multiple of 16; the last
           thread also takes the remainder. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            d = libsais16x64_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                /* Phase 1 (every thread): snapshot the slice into the shared cache. */
                libsais16x64_partial_sorting_scan_right_to_left_32s_6k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* Phase 2 (master only): sequential bucket pass over the whole block. */
            #pragma omp master
            {
                d = libsais16x64_partial_sorting_scan_right_to_left_32s_6k_block_sort(T, buckets, d, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            {
                /* Phase 3 (every thread): write the cached records of the slice to SA.
                   The helper is defined outside this view. */
                libsais16x64_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }

    return d;
}

/*
 * Processes one block of SA, SA[block_start, block_start + block_size), for
 * the 4k right-to-left scan and returns the updated d.
 *
 * A parallel region is opened when threads > 1 and block_size >= 16384. With
 * a single active thread the sequential 4k scan is used. Otherwise the block
 * goes through gather (all threads), sort (master only) and
 * compact-and-place (all threads), separated by barriers. cache is passed as
 * (cache - block_start) so that helpers can index it with absolute SA
 * positions.
 */
static sa_sint_t libsais16x64_partial_sorting_scan_right_to_left_32s_4k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t
omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Even split with the stride rounded down to a multiple of 16; the last
           thread also takes the remainder. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            d = libsais16x64_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                /* Phase 1 (every thread): move live entries of the slice into the cache. */
                libsais16x64_partial_sorting_scan_right_to_left_32s_4k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* Phase 2 (master only): sequential bucket pass over the whole block. */
            #pragma omp master
            {
                d = libsais16x64_partial_sorting_scan_right_to_left_32s_4k_block_sort(T, k, buckets, d, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            {
                /* Phase 3 (every thread): write the cached records of the slice to SA.
                   The helper is defined outside this view. */
                libsais16x64_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }

    return d;
}

/*
 * Processes one block of SA, SA[block_start, block_start + block_size), for
 * the 1k right-to-left scan.
 *
 * A parallel region is opened when threads > 1 and block_size >= 16384. With
 * a single active thread the sequential 1k scan is used. Otherwise the block
 * goes through gather (all threads), sort (master only) and
 * compact-and-place (all threads), separated by barriers. cache is passed as
 * (cache - block_start) so that helpers can index it with absolute SA
 * positions.
 */
static void libsais16x64_partial_sorting_scan_right_to_left_32s_1k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            libsais16x64_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                /* Phase 1 (every thread): move live entries of the slice into the cache. */
                libsais16x64_partial_sorting_scan_right_to_left_32s_1k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* Phase 2 (master only): sequential bucket pass over the whole block. */
            #pragma omp master
            {
                libsais16x64_partial_sorting_scan_right_to_left_32s_1k_block_sort(T, buckets, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            {
                /* Phase 3 (every thread): write the cached records of the slice to SA.
                   The helper is defined outside this view. */
                libsais16x64_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }
}

#endif

/*
 * Driver for the 6k right-to-left partial-sorting scan on integer input.
 *
 * The scanned range is SA[scan_start, scan_end) with
 * scan_start = left_suffixes_count + 1 and scan_end = n - first_lms_suffix.
 * With one thread, or fewer than 65536 entries, the sequential scan handles
 * the whole range. Otherwise (OpenMP builds only) the range is processed from
 * its high end in blocks of at most threads * LIBSAIS_PER_THREAD_CACHE_SIZE
 * entries, all sharing thread_state[0].state.cache.
 *
 * Returns the updated d.
 */
static sa_sint_t libsais16x64_partial_sorting_scan_right_to_left_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
    fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;

    if (threads == 1 || (scan_end - scan_start) < 65536)
    {
        d = libsais16x64_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, scan_start, scan_end - scan_start);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        /* block_start is the highest index of the block; block_end is exclusive
           (one below the lowest index) and becomes the next block_start. */
        fast_sint_t block_start, block_end;
        for (block_start = scan_end - 1; block_start >= scan_start; block_start = block_end)
        {
            block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < scan_start) { block_end = scan_start - 1; }

            d = libsais16x64_partial_sorting_scan_right_to_left_32s_6k_block_omp(T, SA, buckets, d, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif

    return d;
}

/*
 * Driver for the 4k right-to-left partial-sorting scan on integer input over
 * the whole of SA[0, n).
 *
 * With one thread, or n < 65536, the sequential scan handles the whole range.
 * Otherwise (OpenMP builds only) SA is processed from index n - 1 downwards in
 * blocks of at most threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries, all
 * sharing thread_state[0].state.cache.
 *
 * Returns the updated d.
 */
static sa_sint_t libsais16x64_partial_sorting_scan_right_to_left_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
sa_sint_t * RESTRICT buckets, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    if (threads == 1 || n < 65536)
    {
        d = libsais16x64_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, 0, n);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        /* block_start is the highest index of the block; block_end is exclusive
           (one below the lowest index) and becomes the next block_start. */
        fast_sint_t block_start, block_end;
        for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end)
        {
            block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; }

            d = libsais16x64_partial_sorting_scan_right_to_left_32s_4k_block_omp(T, SA, k, buckets, d, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif

    return d;
}

/*
 * Driver for the 1k right-to-left partial-sorting scan on integer input over
 * the whole of SA[0, n).
 *
 * With one thread, or n < 65536, the sequential scan handles the whole range.
 * Otherwise (OpenMP builds only) SA is processed from index n - 1 downwards in
 * blocks of at most threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries, all
 * sharing thread_state[0].state.cache.
 */
static void libsais16x64_partial_sorting_scan_right_to_left_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    if (threads == 1 || n < 65536)
    {
        libsais16x64_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, 0, n);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start, block_end;
        for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end)
        {
            block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; }

            libsais16x64_partial_sorting_scan_right_to_left_32s_1k_block_omp(T, SA, buckets, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif
}

/*
 * Compacts the sign-bit-tagged entries of
 * SA[omp_block_start, omp_block_start + omp_block_size) to the front of that
 * range, preserving their order, and returns the index one past the last
 * compacted entry.
 *
 * Every entry is written to SA[l] as
 * (s - SUFFIX_GROUP_MARKER) & ~SUFFIX_GROUP_MARKER, but l only advances when
 * the original value s is negative, so values from non-negative entries are
 * overwritten by the next write. The main loop is unrolled four times.
 */
static fast_sint_t libsais16x64_partial_sorting_gather_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j, l;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4)
    {
        libsais16x64_prefetchr(&SA[i + prefetch_distance]);

        sa_uint_t s0 = (sa_uint_t)SA[i + 0]; SA[l] = (sa_sint_t)((s0 -
(sa_uint_t)SUFFIX_GROUP_MARKER) & (sa_uint_t)(~SUFFIX_GROUP_MARKER)); l += ((sa_sint_t)s0 < 0);
        sa_uint_t s1 = (sa_uint_t)SA[i + 1]; SA[l] = (sa_sint_t)((s1 - (sa_uint_t)SUFFIX_GROUP_MARKER) & (sa_uint_t)(~SUFFIX_GROUP_MARKER)); l += ((sa_sint_t)s1 < 0);
        sa_uint_t s2 = (sa_uint_t)SA[i + 2]; SA[l] = (sa_sint_t)((s2 - (sa_uint_t)SUFFIX_GROUP_MARKER) & (sa_uint_t)(~SUFFIX_GROUP_MARKER)); l += ((sa_sint_t)s2 < 0);
        sa_uint_t s3 = (sa_uint_t)SA[i + 3]; SA[l] = (sa_sint_t)((s3 - (sa_uint_t)SUFFIX_GROUP_MARKER) & (sa_uint_t)(~SUFFIX_GROUP_MARKER)); l += ((sa_sint_t)s3 < 0);
    }

    /* Tail: the last up-to-three entries. */
    for (j += 3; i < j; i += 1)
    {
        sa_uint_t s = (sa_uint_t)SA[i]; SA[l] = (sa_sint_t)((s - (sa_uint_t)SUFFIX_GROUP_MARKER) & (sa_uint_t)(~SUFFIX_GROUP_MARKER)); l += ((sa_sint_t)s < 0);
    }

    return l;
}

/*
 * Compacts the sign-bit-tagged entries of
 * SA[omp_block_start, omp_block_start + omp_block_size) to the front of that
 * range, preserving their order, and returns the index one past the last
 * compacted entry.
 *
 * Every entry is written to SA[l] with its sign bit cleared (s & SAINT_MAX),
 * but l only advances when the original value s is negative, so values from
 * non-negative entries are overwritten by the next write. The main loop is
 * unrolled four times.
 */
static fast_sint_t libsais16x64_partial_sorting_gather_lms_suffixes_32s_1k(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j, l;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4)
    {
        libsais16x64_prefetchr(&SA[i + prefetch_distance]);

        sa_sint_t s0 = SA[i + 0]; SA[l] = s0 & SAINT_MAX; l += (s0 < 0);
        sa_sint_t s1 = SA[i + 1]; SA[l] = s1 & SAINT_MAX; l += (s1 < 0);
        sa_sint_t s2 = SA[i + 2]; SA[l] = s2 & SAINT_MAX; l += (s2 < 0);
        sa_sint_t s3 = SA[i + 3]; SA[l] = s3 & SAINT_MAX; l += (s3 < 0);
    }

    /* Tail: the last up-to-three entries. */
    for (j += 3; i < j; i += 1)
    {
        sa_sint_t s = SA[i]; SA[l] = s & SAINT_MAX; l += (s < 0);
    }

    return l;
}

/*
 * Compacts all sign-bit-tagged entries of SA[0, n) to the front of SA using
 * the 4k gather routine.
 *
 * A parallel region is opened when threads > 1 and n >= 65536. With a single
 * active thread the whole array is compacted in one call. Otherwise each
 * thread compacts its own slice in place and records the slice start and the
 * number of kept entries; the master thread then concatenates the per-thread
 * runs at the front of SA.
 */
static void libsais16x64_partial_sorting_gather_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(thread_state);

        fast_sint_t
omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Even split with the stride rounded down to a multiple of 16; the last
           thread also takes the remainder. */
        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

        if (omp_num_threads == 1)
        {
            libsais16x64_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                /* Compact this slice in place; remember where it starts and how many
                   entries were kept. */
                thread_state[omp_thread_num].state.position = omp_block_start;
                thread_state[omp_thread_num].state.count = libsais16x64_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size) - omp_block_start;
            }

            #pragma omp barrier

            /* Master: slide every later thread's run down so the runs are contiguous.
               Thread 0's run already sits at index 0 and is never moved. */
            #pragma omp master
            {
                fast_sint_t t, position = 0;
                for (t = 0; t < omp_num_threads; ++t)
                {
                    if (t > 0 && thread_state[t].state.count > 0)
                    {
                        memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
                    }

                    position += thread_state[t].state.count;
                }
            }
        }
#endif
    }
}

/*
 * Compacts all sign-bit-tagged entries of SA[0, n) to the front of SA using
 * the 1k gather routine.
 *
 * A parallel region is opened when threads > 1 and n >= 65536. With a single
 * active thread the whole array is compacted in one call. Otherwise each
 * thread compacts its own slice in place and records the slice start and the
 * number of kept entries; the master thread then concatenates the per-thread
 * runs at the front of SA.
 */
static void libsais16x64_partial_sorting_gather_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : n - omp_block_start; if (omp_num_threads == 1) { libsais16x64_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.position = omp_block_start; thread_state[omp_thread_num].state.count = libsais16x64_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size) - omp_block_start; } #pragma omp barrier #pragma omp master { fast_sint_t t, position = 0; for (t = 0; t < omp_num_threads; ++t) { if (t > 0 && thread_state[t].state.count > 0) { memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); } position += thread_state[t].state.count; } } } #endif } } static void libsais16x64_induce_partial_order_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t flags, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { memset(&buckets[2 * ALPHABET_SIZE], 0, (size_t)2 * ALPHABET_SIZE * sizeof(sa_sint_t)); if (flags & LIBSAIS_FLAGS_GSA) { buckets[4 * ALPHABET_SIZE + BUCKETS_INDEX2(0, 1)] = buckets[4 * ALPHABET_SIZE + BUCKETS_INDEX2(1, 1)] - 1; libsais16x64_flip_suffix_markers_omp(SA, buckets[4 * ALPHABET_SIZE + BUCKETS_INDEX2(0, 1)], threads); } sa_sint_t d = libsais16x64_partial_sorting_scan_left_to_right_16u_omp(T, SA, n, k, buckets, left_suffixes_count, 0, threads, thread_state); libsais16x64_partial_sorting_shift_markers_16u_omp(SA, n, buckets, threads); if (flags & LIBSAIS_FLAGS_GSA) { libsais16x64_partial_gsa_scan_right_to_left_16u_omp(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state); if (T[first_lms_suffix] == 0) { memmove(&SA[1], &SA[0], (size_t)(buckets[BUCKETS_INDEX2(1, 1)] - 1) * sizeof(sa_sint_t)); SA[0] = first_lms_suffix | SAINT_MIN; } buckets[BUCKETS_INDEX2(0, 1)] = 0; } else { 
libsais16x64_partial_sorting_scan_right_to_left_16u_omp(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state); } } static void libsais16x64_induce_partial_order_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { sa_sint_t d = libsais16x64_partial_sorting_scan_left_to_right_32s_6k_omp(T, SA, n, buckets, left_suffixes_count, 0, threads, thread_state); libsais16x64_partial_sorting_shift_markers_32s_6k_omp(SA, k, buckets, threads); libsais16x64_partial_sorting_shift_buckets_32s_6k(k, buckets); libsais16x64_partial_sorting_scan_right_to_left_32s_6k_omp(T, SA, n, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state); } static void libsais16x64_induce_partial_order_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); sa_sint_t d = libsais16x64_partial_sorting_scan_left_to_right_32s_4k_omp(T, SA, n, k, buckets, 0, threads, thread_state); libsais16x64_partial_sorting_shift_markers_32s_4k(SA, n); libsais16x64_partial_sorting_scan_right_to_left_32s_4k_omp(T, SA, n, k, buckets, d, threads, thread_state); libsais16x64_partial_sorting_gather_lms_suffixes_32s_4k_omp(SA, n, threads, thread_state); } static void libsais16x64_induce_partial_order_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { libsais16x64_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, &buckets[1 * (fast_sint_t)k], threads, thread_state); libsais16x64_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, &buckets[0 * (fast_sint_t)k], 
threads, thread_state);
    libsais16x64_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state);
}

/*
 * Partial-order induction driver for integer input with a single k-entry
 * bucket array. Because there is only one array, the symbol counts are
 * recomputed before each scan: once to derive bucket starts for the
 * left-to-right scan and once to derive bucket ends for the right-to-left
 * scan. The marked entries are gathered to the front of SA at the end.
 */
static void libsais16x64_induce_partial_order_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    libsais16x64_count_suffixes_32s(T, n, k, buckets);
    libsais16x64_initialize_buckets_start_32s_1k(k, buckets);
    libsais16x64_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, buckets, threads, thread_state);

    libsais16x64_count_suffixes_32s(T, n, k, buckets);
    libsais16x64_initialize_buckets_end_32s_1k(k, buckets);
    libsais16x64_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, buckets, threads, thread_state);

    libsais16x64_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state);
}

/*
 * Assigns names to the entries SA[omp_block_start .. omp_block_start + omp_block_size).
 *
 * For each entry p, the current name (with SAINT_MIN or-ed in) is stored at
 * SAm[(p & SAINT_MAX) >> 1], where SAm = &SA[m]; the name is then incremented
 * if p was negative. `name` is the starting value for this block.
 *
 * Returns the name value after the last entry of the block.
 */
static sa_sint_t libsais16x64_renumber_lms_suffixes_16u(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT SAm = &SA[m];

    fast_sint_t i, j;
    /* Unrolled by four; the bound leaves room so that the look-ahead reads at i + prefetch_distance + 3 stay inside the block. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
    {
        libsais16x64_prefetchr(&SA[i + 2 * prefetch_distance]);

        libsais16x64_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
        libsais16x64_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
        libsais16x64_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
        libsais16x64_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);

        sa_sint_t p0 = SA[i + 0]; SAm[(p0 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p0 < 0;
        sa_sint_t p1 = SA[i + 1]; SAm[(p1 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p1 < 0;
        sa_sint_t p2 = SA[i + 2]; SAm[(p2 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p2 < 0;
        sa_sint_t p3 = SA[i + 3]; SAm[(p3 & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p3 < 0;
    }

    /* Remainder: same work without look-ahead. */
    for (j += prefetch_distance + 3; i < j; i += 1)
    {
        sa_sint_t p = SA[i]; SAm[(p & SAINT_MAX) >> 1] = name | SAINT_MIN; name += p < 0;
    }

    return name;
}

/*
 * Backward in-place gather of negative entries.
 *
 * Scans SA[m + omp_block_start .. m + omp_block_start + omp_block_size) from
 * the high end down. Each entry is written (masked with SAINT_MAX) to the
 * slot just below the current write cursor, and the cursor moves down only
 * when the entry was negative. `l` is the exclusive upper end of the output.
 *
 * Returns the index of the lowest kept entry.
 */
static fast_sint_t libsais16x64_gather_marked_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    /* Work with an inclusive cursor inside the loops; converted back before returning. */
    l -= 1;

    fast_sint_t i, j;
    for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j; i -= 4)
    {
        libsais16x64_prefetchr(&SA[i - prefetch_distance]);

        sa_sint_t s0 = SA[i - 0]; SA[l] = s0 & SAINT_MAX; l -= s0 < 0;
        sa_sint_t s1 = SA[i - 1]; SA[l] = s1 & SAINT_MAX; l -= s1 < 0;
        sa_sint_t s2 = SA[i - 2]; SA[l] = s2 & SAINT_MAX; l -= s2 < 0;
        sa_sint_t s3 = SA[i - 3]; SA[l] = s3 & SAINT_MAX; l -= s3 < 0;
    }

    for (j -= 3; i >= j; i -= 1)
    {
        sa_sint_t s = SA[i]; SA[l] = s & SAINT_MAX; l -= s < 0;
    }

    l += 1;

    return l;
}

/*
 * Runs libsais16x64_renumber_lms_suffixes_16u over SA[0 .. m), optionally in
 * parallel, and returns the final name value.
 *
 * In the multi-threaded path each thread first counts the negative entries in
 * its block; after a barrier every thread sums the counts of the threads
 * before it to obtain its own starting name. The last thread also publishes
 * the grand total through `name`.
 */
static sa_sint_t libsais16x64_renumber_lms_suffixes_16u_omp(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t name = 0;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;

        if (omp_num_threads == 1)
        {
            name = libsais16x64_renumber_lms_suffixes_16u(SA, m, 0, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                thread_state[omp_thread_num].state.count = libsais16x64_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            {
                /* Exclusive prefix sum over the per-thread counts gives this block's first name. */
                fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }

                if (omp_thread_num == omp_num_threads - 1)
                {
                    name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
                }

                libsais16x64_renumber_lms_suffixes_16u(SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
            }
        }
#endif
    }

    return name;
}

/*
 * Runs libsais16x64_gather_marked_lms_suffixes over the n >> 1 entries that
 * start at SA[m], packing the kept entries so that they end at SA[n + fs).
 *
 * With several threads, every thread except the last gathers towards the top
 * of its own block; the last thread gathers directly towards n + fs. After a
 * barrier the master walks the threads from last to first and moves the
 * earlier runs up so that all runs are contiguous.
 */
static void libsais16x64_gather_marked_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start;

        if (omp_num_threads == 1)
        {
            libsais16x64_gather_marked_lms_suffixes(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                if (omp_thread_num < omp_num_threads - 1)
                {
                    thread_state[omp_thread_num].state.position = libsais16x64_gather_marked_lms_suffixes(SA, m, (fast_sint_t)m + omp_block_start + omp_block_size, omp_block_start, omp_block_size);
                    thread_state[omp_thread_num].state.count = (fast_sint_t)m + omp_block_start + omp_block_size - thread_state[omp_thread_num].state.position;
                }
                else
                {
                    thread_state[omp_thread_num].state.position = libsais16x64_gather_marked_lms_suffixes(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size);
                    thread_state[omp_thread_num].state.count = (fast_sint_t)n + (fast_sint_t)fs - thread_state[omp_thread_num].state.position;
                }
            }

            #pragma omp barrier

            #pragma omp master
            {
                /* The last thread's run is already in its final place; earlier runs are moved up below it. */
                fast_sint_t t, position = (fast_sint_t)n + (fast_sint_t)fs;

                for (t = omp_num_threads - 1; t >= 0; --t)
                {
                    position -= thread_state[t].state.count;
                    if (t != omp_num_threads - 1 && thread_state[t].state.count > 0)
                    {
                        memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
                    }
                }
            }
        }
#endif
    }
}

/*
 * Zeroes the n >> 1 entries starting at SA[m], renumbers SA[0 .. m) into that
 * area and returns the resulting name count.
 *
 * If the count is below m, the marked entries are gathered towards
 * SA[n + fs); otherwise the first m entries of SA are simply masked with
 * SAINT_MAX.
 */
static sa_sint_t libsais16x64_renumber_and_gather_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));

    sa_sint_t name = libsais16x64_renumber_lms_suffixes_16u_omp(SA, m, threads, thread_state);
    if (name < m)
    {
        libsais16x64_gather_marked_lms_suffixes_omp(SA, n, m, fs, threads, thread_state);
    }
    else
    {
        fast_sint_t i; for (i = 0; i < m; i += 1) { SA[i] &= SAINT_MAX; }
    }

    return name;
}

/*
 * Renumbering pass for the integer-alphabet "4k" path over
 * SA[omp_block_start .. omp_block_start + omp_block_size); see the body for
 * how names and the SAINT_MIN flag are combined.
 */
static sa_sint_t libsais16x64_renumber_distinct_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name, fast_sint_t omp_block_start, fast_sint_t
omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT SAm = &SA[m];

    /* p3 carries the previous entry across iterations; it starts at 0, so the first entry never inherits a set flag. */
    fast_sint_t i, j; sa_sint_t p0, p1, p2, p3 = 0;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
    {
        libsais16x64_prefetchw(&SA[i + 2 * prefetch_distance]);

        libsais16x64_prefetchw(&SAm[(SA[i + prefetch_distance + 0] & SAINT_MAX) >> 1]);
        libsais16x64_prefetchw(&SAm[(SA[i + prefetch_distance + 1] & SAINT_MAX) >> 1]);
        libsais16x64_prefetchw(&SAm[(SA[i + prefetch_distance + 2] & SAINT_MAX) >> 1]);
        libsais16x64_prefetchw(&SAm[(SA[i + prefetch_distance + 3] & SAINT_MAX) >> 1]);

        /*
         * Each step masks SA[i] with SAINT_MAX in place, then stores the name
         * at SAm[SA[i] >> 1]. The SAINT_MIN bit of the stored value is set only
         * when both this entry and the previous one were negative.
         */
        p0 = SA[i + 0]; SAm[(SA[i + 0] = p0 & SAINT_MAX) >> 1] = name | (p0 & p3 & SAINT_MIN); name += p0 < 0;
        p1 = SA[i + 1]; SAm[(SA[i + 1] = p1 & SAINT_MAX) >> 1] = name | (p1 & p0 & SAINT_MIN); name += p1 < 0;
        p2 = SA[i + 2]; SAm[(SA[i + 2] = p2 & SAINT_MAX) >> 1] = name | (p2 & p1 & SAINT_MIN); name += p2 < 0;
        p3 = SA[i + 3]; SAm[(SA[i + 3] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN); name += p3 < 0;
    }

    for (j += prefetch_distance + 3; i < j; i += 1)
    {
        p2 = p3; p3 = SA[i]; SAm[(SA[i] = p3 & SAINT_MAX) >> 1] = name | (p3 & p2 & SAINT_MIN); name += p3 < 0;
    }

    return name;
}

/*
 * Pass over SA[m + omp_block_start .. m + omp_block_start + omp_block_size).
 *
 * Each entry keeps its low bits; its SAINT_MIN bit survives only if the most
 * recent non-zero entry seen before it (within this call) also had that bit
 * set. Zero entries are skipped when tracking the "previous" value.
 */
static void libsais16x64_mark_distinct_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    /* p3 holds the last non-zero entry carried into the next iteration; it starts at 0. */
    fast_sint_t i, j; sa_sint_t p0, p1, p2, p3 = 0;
    for (i = (fast_sint_t)m + omp_block_start, j = (fast_sint_t)m + omp_block_start + omp_block_size - 3; i < j; i += 4)
    {
        libsais16x64_prefetchw(&SA[i + prefetch_distance]);

        p0 = SA[i + 0]; SA[i + 0] = p0 & (p3 | SAINT_MAX); p0 = (p0 == 0) ? p3 : p0;
        p1 = SA[i + 1]; SA[i + 1] = p1 & (p0 | SAINT_MAX); p1 = (p1 == 0) ? p0 : p1;
        p2 = SA[i + 2]; SA[i + 2] = p2 & (p1 | SAINT_MAX); p2 = (p2 == 0) ? p1 : p2;
        p3 = SA[i + 3]; SA[i + 3] = p3 & (p2 | SAINT_MAX); p3 = (p3 == 0) ? p2 : p3;
    }

    for (j += 3; i < j; i += 1)
    {
        p2 = p3; p3 = SA[i]; SA[i] = p3 & (p2 | SAINT_MAX); p3 = (p3 == 0) ? p2 : p3;
    }
}

/*
 * Pass over SAm[omp_block_start .. omp_block_start + omp_block_size), where
 * SAm = &SA[m]: negative entries are kept with the SAINT_MAX mask applied,
 * non-negative entries become 0.
 */
static void libsais16x64_clamp_lms_suffixes_length_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT SAm = &SA[m];

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4)
    {
        libsais16x64_prefetchw(&SAm[i + prefetch_distance]);

        SAm[i + 0] = (SAm[i + 0] < 0 ? SAm[i + 0] : 0) & SAINT_MAX;
        SAm[i + 1] = (SAm[i + 1] < 0 ? SAm[i + 1] : 0) & SAINT_MAX;
        SAm[i + 2] = (SAm[i + 2] < 0 ? SAm[i + 2] : 0) & SAINT_MAX;
        SAm[i + 3] = (SAm[i + 3] < 0 ? SAm[i + 3] : 0) & SAINT_MAX;
    }

    for (j += 3; i < j; i += 1)
    {
        SAm[i] = (SAm[i] < 0 ? SAm[i] : 0) & SAINT_MAX;
    }
}

/*
 * Runs libsais16x64_renumber_distinct_lms_suffixes_32s_4k over SA[0 .. m),
 * optionally in parallel. Names start at 1 and the function returns the
 * final name value minus 1.
 *
 * In the multi-threaded path each thread counts the negative entries of its
 * block, and after a barrier derives its starting name as 1 plus the counts
 * of all earlier threads. The last thread publishes the total through `name`.
 */
static sa_sint_t libsais16x64_renumber_distinct_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t name = 0;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;

        if (omp_num_threads == 1)
        {
            name = libsais16x64_renumber_distinct_lms_suffixes_32s_4k(SA, m, 1, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                thread_state[omp_thread_num].state.count = libsais16x64_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            {
                /* Starting name for this block: 1 plus the negatives counted by all earlier threads. */
                fast_sint_t t, count = 1; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }

                if (omp_thread_num == omp_num_threads - 1)
                {
                    name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
                }

                libsais16x64_renumber_distinct_lms_suffixes_32s_4k(SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
            }
        }
#endif
    }

    return name - 1;
}

/*
 * Runs libsais16x64_mark_distinct_lms_suffixes_32s over the n >> 1 entries
 * starting at SA[m], split into per-thread blocks when OpenMP is enabled.
 */
static void libsais16x64_mark_distinct_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
        fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start;
#else
        UNUSED(threads);

        fast_sint_t omp_block_start = 0;
        fast_sint_t omp_block_size = (fast_sint_t)n >> 1;
#endif
        libsais16x64_mark_distinct_lms_suffixes_32s(SA, m, omp_block_start, omp_block_size);
    }
}

/*
 * Runs libsais16x64_clamp_lms_suffixes_length_32s over the n >> 1 entries
 * starting at SA[m], split into per-thread blocks when OpenMP is enabled.
 * Block stride is rounded down to a multiple of 16; the last thread takes the
 * remainder.
 */
static void libsais16x64_clamp_lms_suffixes_length_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
        fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start;
#else
        UNUSED(threads);

        fast_sint_t omp_block_start = 0;
        fast_sint_t omp_block_size = (fast_sint_t)n >> 1;
#endif
        libsais16x64_clamp_lms_suffixes_length_32s(SA, m, omp_block_start, omp_block_size);
    }
}

/*
 * Zeroes the n >> 1 entries starting at SA[m], renumbers SA[0 .. m) with the
 * "4k" renumbering pass and returns the resulting name count. The distinct
 * marking pass is run only when the count is below m.
 */
static sa_sint_t libsais16x64_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t));

    sa_sint_t name = libsais16x64_renumber_distinct_lms_suffixes_32s_4k_omp(SA, m, threads, thread_state);
    if (name < m)
    {
        libsais16x64_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads);
    }

    return name;
}

/*
 * Renumbering for the "1k" path, where names are derived by comparing
 * substrings of T directly.
 *
 * Phase 1: gather positions into the top m entries of SA, clear the area in
 *          between, and for each gathered position store (distance to the
 *          next gathered position + 1), flagged with SAINT_MIN, at
 *          SAm[position >> 1]. The last position gets length 1.
 * Phase 2: clamp the stored values (flag removed, unflagged slots zeroed).
 * Phase 3: walk SA[0 .. m) pairwise; two neighbours are treated as equal when
 *          their stored lengths match and the corresponding symbols of T all
 *          match. Names are written back to SAm, with SAINT_MIN set on
 *          entries that differ from both neighbours.
 *
 * The distinct marking pass runs when name <= m after the loop.
 * Returns name - 1.
 */
static sa_sint_t libsais16x64_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT SAm = &SA[m];

    {
        libsais16x64_gather_lms_suffixes_32s(T, SA, n);

        memset(&SA[m], 0, ((size_t)n - (size_t)m - (size_t)m) * sizeof(sa_sint_t));

        fast_sint_t i, j;
        for (i = (fast_sint_t)n - (fast_sint_t)m, j = (fast_sint_t)n - 1 - prefetch_distance - 3; i < j; i += 4)
        {
            libsais16x64_prefetchr(&SA[i + 2 * prefetch_distance]);

            libsais16x64_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]);
            libsais16x64_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]);
            libsais16x64_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]);
            libsais16x64_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]);

            /* Stored value: gap to the next gathered position, plus one, with SAINT_MIN added as a flag. */
            SAm[((sa_uint_t)SA[i + 0]) >> 1] = SA[i + 1] - SA[i + 0] + 1 + SAINT_MIN;
            SAm[((sa_uint_t)SA[i + 1]) >> 1] = SA[i + 2] - SA[i + 1] + 1 + SAINT_MIN;
            SAm[((sa_uint_t)SA[i + 2]) >> 1] = SA[i + 3] - SA[i + 2] + 1 + SAINT_MIN;
            SAm[((sa_uint_t)SA[i + 3]) >> 1] = SA[i + 4] - SA[i + 3] + 1 + SAINT_MIN;
        }

        for (j += prefetch_distance + 3; i < j; i += 1)
        {
            SAm[((sa_uint_t)SA[i]) >> 1] = SA[i + 1] - SA[i] + 1 + SAINT_MIN;
        }

        /* The final gathered position has no successor, so it is given the value 1 (plus flag). */
        SAm[((sa_uint_t)SA[n - 1]) >> 1] = 1 + SAINT_MIN;
    }

    {
        libsais16x64_clamp_lms_suffixes_length_32s_omp(SA, n, m, threads);
    }

    sa_sint_t name = 1;

    {
        /* p / plen / pdiff describe the previous entry; q / qlen / qdiff the current one. The roles swap every half-iteration. */
        fast_sint_t i, j, p = SA[0], plen = SAm[p >> 1]; sa_sint_t pdiff = SAINT_MIN;
        for (i = 1, j = m - prefetch_distance - 1; i < j; i += 2)
        {
            libsais16x64_prefetchr(&SA[i + 2 * prefetch_distance]);

            libsais16x64_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); libsais16x64_prefetchr(&T[((sa_uint_t)SA[i + prefetch_distance + 0])]);
            libsais16x64_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); libsais16x64_prefetchr(&T[((sa_uint_t)SA[i + prefetch_distance + 1])]);

            fast_sint_t q = SA[i + 0], qlen = SAm[q >> 1]; sa_sint_t qdiff = SAINT_MIN;
            if (plen == qlen)
            {
                /* l reaches qlen only if every symbol matched; (l - qlen) is then 0 and the SAINT_MIN bit of qdiff is cleared. */
                fast_sint_t l = 0; do { if (T[p + l] != T[q + l]) { break; } } while (++l < qlen);
                qdiff = (sa_sint_t)(l - qlen) & SAINT_MIN;
            }
            SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0);

            p = SA[i + 1]; plen = SAm[p >> 1]; pdiff = SAINT_MIN;
            if (qlen == plen)
            {
                fast_sint_t l = 0; do { if (T[q + l] != T[p + l]) { break; } } while (++l < plen);
                pdiff = (sa_sint_t)(l - plen) & SAINT_MIN;
            }
            SAm[q >> 1] = name | (qdiff & pdiff); name += (pdiff < 0);
        }

        for (j += prefetch_distance + 1; i < j; i += 1)
        {
            fast_sint_t q = SA[i], qlen = SAm[q >> 1]; sa_sint_t qdiff = SAINT_MIN;
            if (plen == qlen)
            {
                fast_sint_t l = 0; do { if (T[p + l] != T[q + l]) { break; } } while (++l < plen);
                qdiff = (sa_sint_t)(l - plen) & SAINT_MIN;
            }
            SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0);

            p = q; plen = qlen; pdiff = qdiff;
        }

        /* Flush the name of the final entry, which has no right-hand neighbour. */
        SAm[p >> 1] = name | pdiff; name++;
    }

    if (name <= m)
    {
        libsais16x64_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads);
    }

    return name - 1;
}

/*
 * Replaces each entry of SA[omp_block_start .. omp_block_start + omp_block_size)
 * by a table lookup: SA[i] = SAnm[SA[i]], where SAnm = &SA[n - m].
 */
static void libsais16x64_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    const sa_sint_t * RESTRICT SAnm = &SA[n - m];

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
    {
        libsais16x64_prefetchw(&SA[i + 2 * prefetch_distance]);

        libsais16x64_prefetchr(&SAnm[SA[i + prefetch_distance + 0]]);
        libsais16x64_prefetchr(&SAnm[SA[i + prefetch_distance + 1]]);
        libsais16x64_prefetchr(&SAnm[SA[i + prefetch_distance + 2]]);
        libsais16x64_prefetchr(&SAnm[SA[i + prefetch_distance + 3]]);

        SA[i + 0] = SAnm[SA[i + 0]];
        SA[i + 1] = SAnm[SA[i + 1]];
        SA[i + 2] = SAnm[SA[i + 2]];
        SA[i + 3] = SAnm[SA[i + 3]];
    }

    for (j += prefetch_distance + 3; i < j; i += 1)
    {
        SA[i] = SAnm[SA[i]];
    }
}

/*
 * Runs libsais16x64_reconstruct_lms_suffixes over SA[0 .. m), split into
 * per-thread blocks when OpenMP is enabled.
 */
static void libsais16x64_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
        fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1
? omp_block_stride : m - omp_block_start; #else UNUSED(threads); fast_sint_t omp_block_start = 0; fast_sint_t omp_block_size = m; #endif libsais16x64_reconstruct_lms_suffixes(SA, n, m, omp_block_start, omp_block_size); } } static void libsais16x64_place_lms_suffixes_interval_16u(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t flags, sa_sint_t * RESTRICT buckets) { if (flags & LIBSAIS_FLAGS_GSA) { buckets[7 * ALPHABET_SIZE]--; } { const sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE]; fast_sint_t c, j = n; for (c = ALPHABET_SIZE - 2; c >= 0; --c) { fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] - (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)]; if (l > 0) { fast_sint_t i = bucket_end[c]; if (j - i > 0) { memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); } memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); } } memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); } if (flags & LIBSAIS_FLAGS_GSA) { buckets[7 * ALPHABET_SIZE]++; } } static void libsais16x64_place_lms_suffixes_interval_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) { const sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k]; fast_sint_t c, j = n; for (c = (fast_sint_t)k - 2; c >= 0; --c) { fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] - (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)]; if (l > 0) { fast_sint_t i = bucket_end[c]; if (j - i > 0) { memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); } memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); } } memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); } static void libsais16x64_place_lms_suffixes_interval_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) { fast_sint_t j = n; if (k > 1) { fast_sint_t c; for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); 
c -= BUCKETS_INDEX2(1, 0)) { fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(1, 1)] - (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)]; if (l > 0) { fast_sint_t i = buckets[c]; if (j - i > 0) { memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); } memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); } } } memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); } static void libsais16x64_place_lms_suffixes_interval_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t m, sa_sint_t * RESTRICT buckets) { const fast_sint_t prefetch_distance = 64; sa_sint_t c = k - 1; fast_sint_t i, l = buckets[c]; for (i = (fast_sint_t)m - 1; i >= prefetch_distance + 3; i -= 4) { libsais16x64_prefetchr(&SA[i - 2 * prefetch_distance]); libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 0]]); libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 1]]); libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 2]]); libsais16x64_prefetchr(&T[SA[i - prefetch_distance - 3]]); sa_sint_t p0 = SA[i - 0]; if (T[p0] != c) { c = T[p0]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p0; sa_sint_t p1 = SA[i - 1]; if (T[p1] != c) { c = T[p1]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p1; sa_sint_t p2 = SA[i - 2]; if (T[p2] != c) { c = T[p2]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p2; sa_sint_t p3 = SA[i - 3]; if (T[p3] != c) { c = T[p3]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p3; } for (; i >= 0; i -= 1) { sa_sint_t p = SA[i]; if (T[p] != c) { c = T[p]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p; } memset(&SA[0], 0, (size_t)l * sizeof(sa_sint_t)); } static void libsais16x64_place_lms_suffixes_histogram_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t 
n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) { const sa_sint_t * RESTRICT bucket_end = &buckets[5 * (fast_sint_t)k]; fast_sint_t c, j = n; for (c = (fast_sint_t)k - 2; c >= 0; --c) { fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 1)]; if (l > 0) { fast_sint_t i = bucket_end[c]; if (j - i > 0) { memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); } memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); } } memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); } static void libsais16x64_place_lms_suffixes_histogram_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) { const sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k]; fast_sint_t c, j = n; for (c = (fast_sint_t)k - 2; c >= 0; --c) { fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)]; if (l > 0) { fast_sint_t i = bucket_end[c]; if (j - i > 0) { memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); } memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); } } memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); } static void libsais16x64_place_lms_suffixes_histogram_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) { fast_sint_t j = n; if (k > 1) { fast_sint_t c; for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0)) { fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)]; if (l > 0) { fast_sint_t i = buckets[c]; if (j - i > 0) { memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); } memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); } } } memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); } static void libsais16x64_final_bwt_scan_left_to_right_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t 
prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) { libsais16x64_prefetchw(&SA[i + 2 * prefetch_distance]); sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; SA[induction_bucket[T[p0]]++] = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; SA[induction_bucket[T[p1]]++] = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } } for (j += prefetch_distance + 1; i < j; i += 1) { sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } } } static void libsais16x64_final_bwt_aux_scan_left_to_right_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) { libsais16x64_prefetchw(&SA[i + 2 * prefetch_distance]); sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? 
Ts0 : NULL); sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; SA[induction_bucket[T[p0]]++] = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); if ((p0 & rm) == 0) { I[p0 / (rm + 1)] = induction_bucket[T[p0]]; }} sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; SA[induction_bucket[T[p1]]++] = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); if ((p1 & rm) == 0) { I[p1 / (rm + 1)] = induction_bucket[T[p1]]; }} } for (j += prefetch_distance + 1; i < j; i += 1) { sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]]; } } } } static void libsais16x64_final_sorting_scan_left_to_right_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) { libsais16x64_prefetchw(&SA[i + 2 * prefetch_distance]); sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ? 
Ts1 : NULL); sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } } for (j += prefetch_distance + 1; i < j; i += 1) { sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } } } static void libsais16x64_final_sorting_scan_left_to_right_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) { libsais16x64_prefetchw(&SA[i + 3 * prefetch_distance]); sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 1]; libsais16x64_prefetchr(Ts0 - 1); sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ? 
s1 : 1]; libsais16x64_prefetchr(Ts1 - 1); sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais16x64_prefetchw(&induction_bucket[T[s2 - 1]]); libsais16x64_prefetchr(&T[s2] - 2); } sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais16x64_prefetchw(&induction_bucket[T[s3 - 1]]); libsais16x64_prefetchr(&T[s3] - 2); } sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } } for (j += 2 * prefetch_distance + 1; i < j; i += 1) { sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } } } #if defined(LIBSAIS_OPENMP)
/*
 * Per-thread preparation step of the blocked left-to-right BWT scan (only
 * compiled when LIBSAIS_OPENMP is defined).
 *
 * Clears buckets[0 .. k) and then scans
 * SA[omp_block_start, omp_block_start + omp_block_size). Each entry p is
 * rewritten as p & SAINT_MAX; when p > 0 it is instead replaced by
 * T[p - 1] | SAINT_MIN. Rather than writing the induced value into SA
 * directly, the pair (symbol T[p - 1], index (p - 1) with the top bit set when
 * the preceding symbol is smaller) is appended to cache[], and
 * buckets[T[p - 1]] is incremented.
 *
 * Returns the number of cache entries written.
 */
static fast_sint_t libsais16x64_final_bwt_scan_left_to_right_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; memset(buckets, 0, (size_t)k * sizeof(sa_sint_t)); fast_sint_t i, j, count = 0; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) { libsais16x64_prefetchw(&SA[i + 2 * prefetch_distance]); sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ?
Ts1 : NULL); sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } } for (j += prefetch_distance + 1; i < j; i += 1) { sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } } return count; }
/*
 * Per-thread preparation step of the blocked left-to-right sorting scan.
 *
 * Clears buckets[0 .. k) and scans
 * SA[omp_block_start, omp_block_start + omp_block_size). Each entry p is
 * written back as p ^ SAINT_MIN. When p > 0, the pair (symbol T[p - 1],
 * index (p - 1) with the top bit set when the preceding symbol is smaller) is
 * appended to cache[] and buckets[T[p - 1]] is incremented; nothing is written
 * to the destination buckets in SA at this stage.
 *
 * Returns the number of cache entries written.
 */
static fast_sint_t libsais16x64_final_sorting_scan_left_to_right_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; memset(buckets, 0, (size_t)k * sizeof(sa_sint_t)); fast_sint_t i, j, count = 0; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) { libsais16x64_prefetchw(&SA[i + 2 * prefetch_distance]); sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ?
Ts1 : NULL); sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } } for (j += prefetch_distance + 1; i < j; i += 1) { sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } } return count; }
/*
 * Placement step of the blocked left-to-right scans: writes the first `count`
 * cache entries into SA in cache order, each at SA[buckets[symbol]++].
 *
 * buckets[] is read as the next free slot per symbol and advanced as entries
 * are placed. The main loop places four entries per iteration and prefetches
 * the cache prefetch_distance entries ahead; the second loop places the
 * remaining (count % 4) entries.
 */
static void libsais16x64_final_order_scan_left_to_right_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = 0, j = count - 3; i < j; i += 4) { libsais16x64_prefetchr(&cache[i + prefetch_distance]); SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index; SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index; SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index; SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index; } for (j += 3; i < j; i += 1) { SA[buckets[cache[i].symbol]++] = cache[i].index; } }
/*
 * Placement step with auxiliary index sampling: like the plain placement,
 * each of the first `count` cache entries is written to SA[buckets[symbol]++].
 *
 * In addition, whenever (index & rm) == 0, the already-incremented bucket
 * value is recorded in I[(index & SAINT_MAX) / (rm + 1)]; the mask strips the
 * flag bit carried in the top bit of the cached index.
 * NOTE(review): rm is presumably a sampling mask of the form 2^r - 1 -- confirm
 * against the caller.
 */
static void libsais16x64_final_bwt_aux_scan_left_to_right_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = 0, j = count - 3; i < j; i += 4) { libsais16x64_prefetchr(&cache[i + prefetch_distance]); SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index; if ((cache[i + 0].index & rm) == 0) { I[(cache[i + 0].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 0].symbol]; } SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index; if
((cache[i + 1].index & rm) == 0) { I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 1].symbol]; } SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index; if ((cache[i + 2].index & rm) == 0) { I[(cache[i + 2].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 2].symbol]; } SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index; if ((cache[i + 3].index & rm) == 0) { I[(cache[i + 3].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 3].symbol]; } } for (j += 3; i < j; i += 1) { SA[buckets[cache[i].symbol]++] = cache[i].index; if ((cache[i].index & rm) == 0) { I[(cache[i].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol]; } } }
/*
 * Gather step of the blocked left-to-right sorting scan for sa_sint_t text.
 *
 * For every i in [omp_block_start, omp_block_start + omp_block_size): SA[i] is
 * rewritten as p ^ SAINT_MIN, and cache[i] receives the data needed to induce
 * from that entry. cache[i].symbol is T[p - 1] when p > 0 and SAINT_MIN
 * (negative, i.e. "nothing to induce") otherwise; cache[i].index is written
 * only in the p > 0 case and holds p - 1 with the top bit set when the
 * preceding symbol is smaller.
 *
 * cache is indexed with the same i as SA (the caller in this file passes
 * cache - block_start).
 */
static void libsais16x64_final_sorting_scan_left_to_right_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) { libsais16x64_prefetchw(&SA[i + 2 * prefetch_distance]); sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais16x64_prefetchr(Ts0 - 1); libsais16x64_prefetchr(Ts0 - 2); sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 : 2]; libsais16x64_prefetchr(Ts1 - 1); libsais16x64_prefetchr(Ts1 - 2); libsais16x64_prefetchw(&cache[i + prefetch_distance]); sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; cache[i + 0].index = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); symbol0 = T[p0]; } cache[i + 0].symbol = symbol0; sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; cache[i + 1].index = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); symbol1 = T[p1]; } cache[i + 1].symbol = symbol1; } for (j += prefetch_distance + 1; i < j; i += 1) { sa_sint_t symbol = SAINT_MIN, p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; cache[i].index = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); symbol = T[p]; } cache[i].symbol = symbol; } }
/*
 * Sequential resolve step over the gathered cache entries for
 * [omp_block_start, omp_block_start + omp_block_size), in ascending order.
 *
 * Entries whose symbol is negative are skipped. For the others, the symbol
 * field is overwritten with the destination slot taken from
 * induction_bucket[symbol]++. If that destination lies below omp_block_end
 * (i.e. inside the block being processed), the cached index np is toggled in
 * place (np ^ SAINT_MIN) and, when np > 0, the cache entry at the destination
 * is filled for position np - 1 (index with flag bit, symbol T[np - 1]), so
 * that a later iteration of this loop can pick it up.
 * NOTE(review): this relies on the destination being ahead of i; that is not
 * checked here.
 */
static void libsais16x64_final_sorting_scan_left_to_right_32s_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size; for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) { libsais16x64_prefetchw(&cache[i + 2 * prefetch_distance]); sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 > 0 ? s0 : 0]; libsais16x64_prefetchw(Is0); sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 > 0 ?
s1 : 0]; libsais16x64_prefetchw(Is1); sa_sint_t v0 = cache[i + 0].symbol; if (v0 >= 0) { cache[i + 0].symbol = induction_bucket[v0]++; if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; cache[i + 0].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | (sa_sint_t)((sa_uint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } } sa_sint_t v1 = cache[i + 1].symbol; if (v1 >= 0) { cache[i + 1].symbol = induction_bucket[v1]++; if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; cache[i + 1].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | (sa_sint_t)((sa_uint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } } } for (j += prefetch_distance + 1; i < j; i += 1) { sa_sint_t v = cache[i].symbol; if (v >= 0) { cache[i].symbol = induction_bucket[v]++; if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; cache[i].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | (sa_sint_t)((sa_uint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } } } }
/*
 * Processes one block SA[block_start, block_start + block_size) of the
 * left-to-right BWT scan, optionally with several OpenMP threads.
 *
 * The parallel region is entered only when threads > 1, the block holds at
 * least 64 * max(k, 256) entries and omp_get_dynamic() == 0. The block is cut
 * into per-thread slices whose length is rounded down to a multiple of 16;
 * the last thread also takes the remainder.
 *
 * With a single thread the sequential scan is called on the whole slice.
 * Otherwise three phases separated by barriers are run:
 *   1. every thread fills its private cache and bucket counts (block_prepare);
 *   2. the master thread walks the threads in order and, per symbol, swaps the
 *      thread's count for the current induction_bucket value while advancing
 *      induction_bucket by that count, so each thread ends up with its own
 *      starting offsets;
 *   3. every thread writes its cached entries into SA (block_place).
 */
static void libsais16x64_final_bwt_scan_left_to_right_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ?
k : 256) && omp_get_dynamic() == 0) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(k); UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { libsais16x64_final_bwt_scan_left_to_right_16u(T, SA, induction_bucket, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.count = libsais16x64_final_bwt_scan_left_to_right_16u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); } #pragma omp barrier #pragma omp master { fast_sint_t t; for (t = 0; t < omp_num_threads; ++t) { sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; } } } #pragma omp barrier { libsais16x64_final_order_scan_left_to_right_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); } } #endif } }
/*
 * Processes one block SA[block_start, block_start + block_size) of the
 * left-to-right BWT scan that also samples positions into I[] (rm is the
 * sampling mask handed through to the scan/placement helpers).
 *
 * Threading follows the same scheme as the plain BWT block driver: the
 * parallel region is used only when threads > 1, block_size is at least
 * 64 * max(k, 256) and omp_get_dynamic() == 0; slices are multiples of 16 with
 * the remainder given to the last thread. The multi-threaded path reuses the
 * plain BWT block_prepare, lets the master thread convert per-thread counts
 * into per-thread start offsets between two barriers, and then places the
 * entries with the aux variant of block_place, which is where I[] is updated.
 */
static void libsais16x64_final_bwt_aux_scan_left_to_right_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ?
k : 256) && omp_get_dynamic() == 0) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(k); UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { libsais16x64_final_bwt_aux_scan_left_to_right_16u(T, SA, rm, I, induction_bucket, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.count = libsais16x64_final_bwt_scan_left_to_right_16u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); } #pragma omp barrier #pragma omp master { fast_sint_t t; for (t = 0; t < omp_num_threads; ++t) { sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; } } } #pragma omp barrier { libsais16x64_final_bwt_aux_scan_left_to_right_16u_block_place(SA, rm, I, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); } } #endif } }
/*
 * Processes one block SA[block_start, block_start + block_size) of the
 * left-to-right sorting scan for 16-bit text, optionally with several OpenMP
 * threads.
 *
 * The parallel region is used only when threads > 1, block_size is at least
 * 64 * max(k, 256) and omp_get_dynamic() == 0. Slices are multiples of 16 with
 * the remainder given to the last thread. A single thread runs the sequential
 * scan; otherwise each thread runs the sorting block_prepare, the master
 * thread turns the per-thread symbol counts into per-thread start offsets
 * (advancing induction_bucket by the total), and after the second barrier each
 * thread places its cached entries into SA.
 */
static void libsais16x64_final_sorting_scan_left_to_right_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ?
k : 256) && omp_get_dynamic() == 0) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(k); UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { libsais16x64_final_sorting_scan_left_to_right_16u(T, SA, induction_bucket, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.count = libsais16x64_final_sorting_scan_left_to_right_16u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); } #pragma omp barrier #pragma omp master { fast_sint_t t; for (t = 0; t < omp_num_threads; ++t) { sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; } } } #pragma omp barrier { libsais16x64_final_order_scan_left_to_right_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); } } #endif } }
/*
 * Processes one block SA[block_start, block_start + block_size) of the
 * left-to-right sorting scan for sa_sint_t text.
 *
 * The parallel region is used when threads > 1 and block_size >= 16384. Slices
 * are multiples of 16 with the remainder given to the last thread. A single
 * thread runs the sequential 32s scan. Otherwise the work is split into three
 * phases separated by barriers: all threads gather their slice into the shared
 * cache, the master thread alone resolves the whole block with block_sort,
 * and all threads then write their slice back with
 * libsais16x64_compact_and_place_cached_suffixes.
 *
 * cache is passed as (cache - block_start) so the helpers can index it with
 * absolute SA positions.
 */
static void libsais16x64_final_sorting_scan_left_to_right_32s_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num =
omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); UNUSED(cache); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { libsais16x64_final_sorting_scan_left_to_right_32s(T, SA, buckets, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { libsais16x64_final_sorting_scan_left_to_right_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); } #pragma omp barrier #pragma omp master { libsais16x64_final_sorting_scan_left_to_right_32s_block_sort(T, buckets, cache - block_start, block_start, block_size); } #pragma omp barrier { libsais16x64_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); } } #endif } } #endif
/*
 * Driver for the left-to-right BWT scan over SA[0, n) for 16-bit text.
 *
 * First seeds the scan with position n - 1: it is appended to the bucket of
 * T[n - 1], with the top bit set when T[n - 2] < T[n - 1] (so n >= 2 is
 * assumed by this code).
 *
 * When threads == 1 or n < 65536 the sequential scan handles the whole array.
 * Otherwise (OpenMP builds only) SA is walked from the left: zero entries are
 * skipped, and each maximal run of non-zero entries -- capped at
 * threads * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads) entries and at n --
 * is processed either inline (runs shorter than 32) or by the block_omp
 * helper.
 */
static void libsais16x64_final_bwt_scan_left_to_right_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | (sa_sint_t)((sa_uint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1)); if (threads == 1 || n < 65536) { libsais16x64_final_bwt_scan_left_to_right_16u(T, SA, induction_bucket, 0, n); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t block_start; for (block_start = 0; block_start < n; ) { if (SA[block_start] == 0) { block_start++; } else { fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;} fast_sint_t block_end = block_start + 1; while (block_end < block_max_end &&
SA[block_end] != 0) { block_end++; } fast_sint_t block_size = block_end - block_start; if (block_size < 32) { for (; block_start < block_end; block_start += 1) { sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[block_start] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } } } else { libsais16x64_final_bwt_scan_left_to_right_16u_block_omp(T, SA, k, induction_bucket, block_start, block_size, threads, thread_state); block_start = block_end; } } } } #else UNUSED(k); UNUSED(thread_state); #endif }
/*
 * Driver for the left-to-right BWT scan over SA[0, n) that additionally
 * records sampled positions in I[].
 *
 * Seeds the scan with position n - 1 (top bit set when T[n - 2] < T[n - 1]);
 * if ((n - 1) & rm) == 0 the incremented bucket value is stored in
 * I[(n - 1) / (rm + 1)].
 *
 * When threads == 1 or n < 65536 the sequential aux scan handles the whole
 * array. Otherwise (OpenMP builds only) zero entries of SA are skipped and
 * each run of non-zero entries -- capped at
 * threads * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads) entries and at n --
 * is processed inline when shorter than 32 entries or handed to the aux
 * block_omp helper.
 */
static void libsais16x64_final_bwt_aux_scan_left_to_right_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t k, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | (sa_sint_t)((sa_uint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1)); if ((((sa_sint_t)n - 1) & rm) == 0) { I[((sa_sint_t)n - 1) / (rm + 1)] = induction_bucket[T[(sa_sint_t)n - 1]]; } if (threads == 1 || n < 65536) { libsais16x64_final_bwt_aux_scan_left_to_right_16u(T, SA, rm, I, induction_bucket, 0, n); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t block_start; for (block_start = 0; block_start < n; ) { if (SA[block_start] == 0) { block_start++; } else { fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;} fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; } fast_sint_t block_size = block_end - block_start; if (block_size < 32) { for (; block_start < block_end; block_start += 1) { sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[block_start] = T[p] | SAINT_MIN;
SA[induction_bucket[T[p]]++] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]]; } } } } else { libsais16x64_final_bwt_aux_scan_left_to_right_16u_block_omp(T, SA, k, rm, I, induction_bucket, block_start, block_size, threads, thread_state); block_start = block_end; } } } } #else UNUSED(k); UNUSED(thread_state); #endif }
/*
 * Driver for the left-to-right sorting scan over SA[0, n) for 16-bit text.
 *
 * Seeds the scan with position n - 1, appended to the bucket of T[n - 1] with
 * the top bit set when T[n - 2] < T[n - 1].
 *
 * When threads == 1 or n < 65536 the sequential scan handles the whole array.
 * Otherwise (OpenMP builds only) zero entries of SA are skipped and each run
 * of non-zero entries -- capped at
 * threads * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads) entries and at n --
 * is processed inline when shorter than 32 entries (toggling the top bit of
 * each visited entry) or handed to the sorting block_omp helper.
 */
static void libsais16x64_final_sorting_scan_left_to_right_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | (sa_sint_t)((sa_uint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1)); if (threads == 1 || n < 65536) { libsais16x64_final_sorting_scan_left_to_right_16u(T, SA, induction_bucket, 0, n); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t block_start; for (block_start = 0; block_start < n; ) { if (SA[block_start] == 0) { block_start++; } else { fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;} fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; } fast_sint_t block_size = block_end - block_start; if (block_size < 32) { for (; block_start < block_end; block_start += 1) { sa_sint_t p = SA[block_start]; SA[block_start] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } } } else { libsais16x64_final_sorting_scan_left_to_right_16u_block_omp(T, SA, k, induction_bucket, block_start, block_size, threads, thread_state); block_start = block_end; } } } } #else UNUSED(k); UNUSED(thread_state); #endif }
/*
 * Driver for the left-to-right sorting scan over SA[0, n) for sa_sint_t text.
 *
 * Seeds the scan with position n - 1 and then either runs the sequential 32s
 * scan (threads == 1 or n < 65536) or, in OpenMP builds, processes SA in
 * consecutive blocks of threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries using
 * the cache of thread_state[0].
 */
static void libsais16x64_final_sorting_scan_left_to_right_32s_omp(const sa_sint_t *
RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { SA[induction_bucket[T[n - 1]]++] = (n - 1) | (sa_sint_t)((sa_uint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)); if (threads == 1 || n < 65536) { libsais16x64_final_sorting_scan_left_to_right_32s(T, SA, induction_bucket, 0, n); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t block_start, block_end; for (block_start = 0; block_start < n; block_start = block_end) { block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; } libsais16x64_final_sorting_scan_left_to_right_32s_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, block_start, block_end - block_start, threads); } } #else UNUSED(thread_state); #endif }
/*
 * Right-to-left BWT scan of SA[omp_block_start, omp_block_start + omp_block_size)
 * for 16-bit text, visiting entries from the highest index downwards.
 *
 * Each entry p is rewritten as p & SAINT_MAX. When p > 0, the entry is
 * replaced by the symbol c1 = T[p - 1], and a new value is written at
 * SA[--induction_bucket[c1]]: the position p - 1 itself when the preceding
 * symbol c0 satisfies c0 <= c1, otherwise c0 | SAINT_MIN (the symbol with the
 * top bit set).
 *
 * Returns the SA index of the entry that held 0, as last seen during this
 * descending scan, or -1 if no such entry was visited.
 */
static sa_sint_t libsais16x64_final_bwt_scan_right_to_left_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; sa_sint_t index = -1; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) { libsais16x64_prefetchw(&SA[i - 2 * prefetch_distance]); sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); sa_sint_t p0 = SA[i - 0]; index = (p0 == 0) ? (sa_sint_t)(i - 0) : index; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint16_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; } sa_sint_t p1 = SA[i - 1]; index = (p1 == 0) ?
(sa_sint_t)(i - 1) : index; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint16_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t; } } for (j -= prefetch_distance + 1; i >= j; i -= 1) { sa_sint_t p = SA[i]; index = (p == 0) ? (sa_sint_t)i : index; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; } } return index; }
/*
 * Right-to-left BWT scan of SA[omp_block_start, omp_block_start + omp_block_size)
 * for 16-bit text that additionally records sampled positions in I[].
 *
 * Each entry p is rewritten as p & SAINT_MAX. When p > 0, the entry is
 * replaced by c1 = T[p - 1] and SA[--induction_bucket[c1]] receives either
 * p - 1 (when the preceding symbol c0 <= c1) or c0 | SAINT_MIN. If
 * ((p - 1) & rm) == 0, the slot just written, plus one, is stored in
 * I[(p - 1) / (rm + 1)].
 */
static void libsais16x64_final_bwt_aux_scan_right_to_left_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) { libsais16x64_prefetchw(&SA[i - 2 * prefetch_distance]); sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint16_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; if ((p0 & rm) == 0) { I[p0 / (rm + 1)] = induction_bucket[T[p0]] + 1; } } sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint16_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ?
p1 : t; if ((p1 & rm) == 0) { I[p1 / (rm + 1)] = induction_bucket[T[p1]] + 1; } } } for (j -= prefetch_distance + 1; i >= j; i -= 1) { sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]] + 1; } } } }
/*
 * Right-to-left sorting scan of
 * SA[omp_block_start, omp_block_start + omp_block_size) for 16-bit text,
 * visiting entries from the highest index downwards.
 *
 * Each entry p has its top bit cleared (p & SAINT_MAX). When p > 0, position
 * p - 1 is stored at SA[--induction_bucket[T[p - 1]]], with the top bit set
 * when the symbol preceding position p - 1 is greater than T[p - 1]; the
 * (p > 0) term keeps that look-back at index >= 0.
 *
 * The first loop handles two entries per iteration with prefetching
 * prefetch_distance entries ahead (towards lower indices); the second loop
 * finishes the remaining entries.
 */
static void libsais16x64_final_sorting_scan_right_to_left_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) { libsais16x64_prefetchw(&SA[i - 2 * prefetch_distance]); sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ?
Ts1 : NULL); sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); } } for (j -= prefetch_distance + 1; i >= j; i -= 1) { sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } } }
/*
 * Right-to-left scan used for the GSA variant (per the function name) over
 * SA[omp_block_start, omp_block_start + omp_block_size) for 16-bit text.
 *
 * Identical to the plain right-to-left sorting scan except for one extra
 * condition: position p - 1 is only induced when T[p - 1] > 0, so positions
 * holding symbol 0 are never written into a bucket by this pass.
 * NOTE(review): symbol 0 presumably acts as the string separator in the
 * generalized suffix array -- confirm against the public API documentation.
 */
static void libsais16x64_final_gsa_scan_right_to_left_16u(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) { libsais16x64_prefetchw(&SA[i - 2 * prefetch_distance]); sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ?
Ts1 : NULL); sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0 && T[p0 - 1] > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0 && T[p1 - 1] > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); } } for (j -= prefetch_distance + 1; i >= j; i -= 1) { sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0 && T[p - 1] > 0) { p--; SA[--induction_bucket[T[p]]] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } } }
/*
 * Right-to-left sorting scan of
 * SA[omp_block_start, omp_block_start + omp_block_size) for a text T whose
 * symbols are stored as sa_sint_t.
 *
 * Each entry p has its top bit cleared. When p > 0, position p - 1 is stored
 * at SA[--induction_bucket[T[p - 1]]], with the top bit set when the symbol
 * preceding position p - 1 is greater than T[p - 1].
 *
 * Prefetching is staged at two distances (towards lower indices): T for the
 * entry 2 * prefetch_distance away and the induction_bucket slot for the entry
 * 1 * prefetch_distance away; the second loop handles the remaining tail.
 */
static void libsais16x64_final_sorting_scan_right_to_left_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2) { libsais16x64_prefetchw(&SA[i - 3 * prefetch_distance]); sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 1]; libsais16x64_prefetchr(Ts0 - 1); sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 : 1]; libsais16x64_prefetchr(Ts1 - 1); sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais16x64_prefetchw(&induction_bucket[T[s2 - 1]]); libsais16x64_prefetchr(&T[s2] - 2); } sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais16x64_prefetchw(&induction_bucket[T[s3 - 1]]); libsais16x64_prefetchr(&T[s3] - 2); } sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); } } for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) { sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } } } #if defined(LIBSAIS_OPENMP)
/*
 * Per-thread preparation step of the blocked right-to-left BWT scan (only
 * compiled when LIBSAIS_OPENMP is defined).
 *
 * Clears buckets[0 .. k) and scans the block from its highest index downwards.
 * Each entry p is rewritten as p & SAINT_MAX; when p > 0 it is replaced by
 * c1 = T[p - 1]. Instead of writing into the destination bucket, one cache
 * entry is appended with symbol c1 and index equal to p - 1 when the preceding
 * symbol c0 <= c1, or c0 | SAINT_MIN otherwise; buckets[c1] is incremented.
 *
 * Returns the number of cache entries written.
 */
static fast_sint_t libsais16x64_final_bwt_scan_right_to_left_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; memset(buckets, 0, (size_t)k * sizeof(sa_sint_t)); fast_sint_t i, j, count = 0; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) { libsais16x64_prefetchw(&SA[i - 2 * prefetch_distance]); sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ?
Ts1 : NULL); sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint16_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p0 : t; } sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint16_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p1 : t; } } for (j -= prefetch_distance + 1; i >= j; i -= 1) { sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p : t; } } return count; }
/*
 * Per-thread preparation step of the blocked right-to-left BWT scan with
 * auxiliary index sampling.
 *
 * Works like the plain BWT right-to-left block_prepare, but every induced
 * position occupies two consecutive cache slots: cache[count] holds the symbol
 * c1 and the value to store in SA (p - 1, or c0 | SAINT_MIN when c0 > c1), and
 * cache[count + 1].index holds the plain position p - 1. count is advanced by
 * two per induced position.
 *
 * Returns the number of cache slots used (twice the number of induced
 * positions).
 */
static fast_sint_t libsais16x64_final_bwt_aux_scan_right_to_left_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; memset(buckets, 0, (size_t)k * sizeof(sa_sint_t)); fast_sint_t i, j, count = 0; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) { libsais16x64_prefetchw(&SA[i - 2 * prefetch_distance]); sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint16_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ?
p0 : t; cache[count + 1].index = p0; count += 2; } sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint16_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p1 : t; cache[count + 1].index = p1; count += 2; } } for (j -= prefetch_distance + 1; i >= j; i -= 1) { sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p : t; cache[count + 1].index = p; count += 2; } } return count; } /* Prepare step of the multi-threaded final-sorting right-to-left scan for one thread's slice [omp_block_start, omp_block_start + omp_block_size) of SA. Zeroes the first k entries of `buckets`, masks every SA entry of the slice with SAINT_MAX (walking from the highest index down) and, for each entry p > 0, appends one record to `cache`: symbol = T[p - 1], which is also counted in buckets[symbol], and index = p - 1 with bit (SAINT_BIT - 1) set when the symbol preceding T[p - 1] is greater than T[p - 1]. Returns the number of cache records written. */ static fast_sint_t libsais16x64_final_sorting_scan_right_to_left_16u_block_prepare(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; memset(buckets, 0, (size_t)k * sizeof(sa_sint_t)); fast_sint_t i, j, count = 0; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) { libsais16x64_prefetchw(&SA[i - 2 * prefetch_distance]); sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint16_t * Ts0 = &T[s0] - 1; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); Ts0--; libsais16x64_prefetchr(s0 > 0 ? Ts0 : NULL); sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint16_t * Ts1 = &T[s1] - 1; libsais16x64_prefetchr(s1 > 0 ? Ts1 : NULL); Ts1--; libsais16x64_prefetchr(s1 > 0 ?
Ts1 : NULL); sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); } } for (j -= prefetch_distance + 1; i >= j; i -= 1) { sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } } return count; } /* Place step of the multi-threaded right-to-left scans: replays the first `count` cache records in the order they were written, storing each record's index at SA[--buckets[symbol]]. `buckets` is used as a pre-decremented cursor per symbol. The main loop is unrolled by four and issues a read hint on the cache prefetch_distance records ahead; the trailing loop handles the last records. */ static void libsais16x64_final_order_scan_right_to_left_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = 0, j = count - 3; i < j; i += 4) { libsais16x64_prefetchr(&cache[i + prefetch_distance]); SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index; SA[--buckets[cache[i + 1].symbol]] = cache[i + 1].index; SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index; SA[--buckets[cache[i + 3].symbol]] = cache[i + 3].index; } for (j += 3; i < j; i += 1) { SA[--buckets[cache[i].symbol]] = cache[i].index; } } /* Place step of the multi-threaded GSA right-to-left scan. Same replay as the order place step, except that a record is only placed when its symbol is > 0; for any other record neither SA nor buckets is touched. */ static void libsais16x64_final_gsa_scan_right_to_left_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = 0, j = count - 3; i < j; i += 4) { libsais16x64_prefetchr(&cache[i + prefetch_distance]); if (cache[i + 0].symbol > 0) { SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index; } if (cache[i + 1].symbol > 0) { SA[--buckets[cache[i + 1].symbol]] = cache[i + 1].index; } if (cache[i + 2].symbol > 0) { SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index; } if (cache[i +
3].symbol > 0) { SA[--buckets[cache[i + 3].symbol]] = cache[i + 3].index; } } for (j += 3; i < j; i += 1) { if (cache[i].symbol > 0) { SA[--buckets[cache[i].symbol]] = cache[i].index; } } } /* Place step of the multi-threaded BWT-with-auxiliary-indexes right-to-left scan. Cache records are consumed in pairs: the first record's index is stored at SA[--buckets[symbol]]; then, if the second record's index has no bit in common with the mask `rm`, the SA position just used plus one is stored in I[index / (rm + 1)]. The main loop handles four pairs per iteration, the trailing loop one pair. NOTE(review): only the trailing loop masks the second record's index with SAINT_MAX before dividing; the unrolled loop divides it unmasked. The prepare step visible in this file stores a decremented positive position there, so the two forms presumably agree -- confirm. */ static void libsais16x64_final_bwt_aux_scan_right_to_left_16u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = 0, j = count - 6; i < j; i += 8) { libsais16x64_prefetchr(&cache[i + prefetch_distance]); SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index; if ((cache[i + 1].index & rm) == 0) { I[cache[i + 1].index / (rm + 1)] = buckets[cache[i + 0].symbol] + 1; } SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index; if ((cache[i + 3].index & rm) == 0) { I[cache[i + 3].index / (rm + 1)] = buckets[cache[i + 2].symbol] + 1; } SA[--buckets[cache[i + 4].symbol]] = cache[i + 4].index; if ((cache[i + 5].index & rm) == 0) { I[cache[i + 5].index / (rm + 1)] = buckets[cache[i + 4].symbol] + 1; } SA[--buckets[cache[i + 6].symbol]] = cache[i + 6].index; if ((cache[i + 7].index & rm) == 0) { I[cache[i + 7].index / (rm + 1)] = buckets[cache[i + 6].symbol] + 1; } } for (j += 6; i < j; i += 2) { SA[--buckets[cache[i].symbol]] = cache[i].index; if ((cache[i + 1].index & rm) == 0) { I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol] + 1; } } } /* Gather step of the multi-threaded 32-bit-alphabet final-sorting right-to-left scan. Walks the slice [omp_block_start, omp_block_start + omp_block_size) in ascending order, masks every SA entry with SAINT_MAX and fills cache[i] for the same index i: for an entry p > 0, index = p - 1 with bit (SAINT_BIT - 1) set when the symbol preceding T[p - 1] is greater than T[p - 1], and symbol = T[p - 1]; for any other entry symbol = SAINT_MIN and the index field is left unwritten. */ static void libsais16x64_final_sorting_scan_right_to_left_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) { libsais16x64_prefetchw(&SA[i + 2 * prefetch_distance]); sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ?
s0 : 2]; libsais16x64_prefetchr(Ts0 - 1); libsais16x64_prefetchr(Ts0 - 2); sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ? s1 : 2]; libsais16x64_prefetchr(Ts1 - 1); libsais16x64_prefetchr(Ts1 - 2); libsais16x64_prefetchw(&cache[i + prefetch_distance]); sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; cache[i + 0].index = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); symbol0 = T[p0]; } cache[i + 0].symbol = symbol0; sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; cache[i + 1].index = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); symbol1 = T[p1]; } cache[i + 1].symbol = symbol1; } for (j += prefetch_distance + 1; i < j; i += 1) { sa_sint_t symbol = SAINT_MIN, p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; cache[i].index = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); symbol = T[p]; } cache[i].symbol = symbol; } } /* Induction step over the gathered cache, walking indices from omp_block_start + omp_block_size - 1 down to omp_block_start. For each record whose symbol is >= 0, the symbol field is replaced by the destination position --induction_bucket[symbol]. When that destination is >= omp_block_start, the current record's index is masked with SAINT_MAX in place and, if the unmasked index np is > 0, the record for position np - 1 (index with the comparison bit, symbol = T[np - 1]) is written straight into cache[destination] so that it is seen later in the same walk. Records with a negative symbol are skipped. */ static void libsais16x64_final_sorting_scan_right_to_left_32s_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) { libsais16x64_prefetchw(&cache[i - 2 * prefetch_distance]); sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 > 0 ? s0 : 0]; libsais16x64_prefetchw(Is0); sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 > 0 ?
s1 : 0]; libsais16x64_prefetchw(Is1); sa_sint_t v0 = cache[i - 0].symbol; if (v0 >= 0) { cache[i - 0].symbol = --induction_bucket[v0]; if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; cache[i - 0].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | (sa_sint_t)((sa_uint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } } sa_sint_t v1 = cache[i - 1].symbol; if (v1 >= 0) { cache[i - 1].symbol = --induction_bucket[v1]; if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; cache[i - 1].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | (sa_sint_t)((sa_uint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } } } for (j -= prefetch_distance + 1; i >= j; i -= 1) { sa_sint_t v = cache[i].symbol; if (v >= 0) { cache[i].symbol = --induction_bucket[v]; if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; cache[i].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | (sa_sint_t)((sa_uint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } } } } } /* Processes one SA block [block_start, block_start + block_size) of the BWT right-to-left scan. The block is divided into per-thread slices whose stride is rounded down to a multiple of 16, the last thread taking the remainder. With a single thread the sequential scan is called directly. Otherwise: (1) every thread runs the block-prepare step into its own thread_state buckets and cache; (2) after a barrier the master thread walks the threads from last to first and, per symbol, hands each thread the current induction_bucket value as its cursor while lowering induction_bucket by that thread's count; (3) after a second barrier every thread places its cached records with the order place step. The parallel region is only entered when threads > 1, block_size >= 64 * max(k, 256) and omp_get_dynamic() == 0. */ static void libsais16x64_final_bwt_scan_right_to_left_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ?
k : 256) && omp_get_dynamic() == 0) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(k); UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { libsais16x64_final_bwt_scan_right_to_left_16u(T, SA, induction_bucket, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.count = libsais16x64_final_bwt_scan_right_to_left_16u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); } #pragma omp barrier #pragma omp master { fast_sint_t t; for (t = omp_num_threads - 1; t >= 0; --t) { sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; } } } #pragma omp barrier { libsais16x64_final_order_scan_right_to_left_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); } } #endif } } /* Processes one SA block [block_start, block_start + block_size) of the BWT-with-auxiliary-indexes right-to-left scan. Slices the block per thread (stride rounded down to a multiple of 16, remainder to the last thread). With a single thread the sequential aux scan is called with rm and I. Otherwise every thread runs the aux prepare step, the master thread turns the per-thread symbol counts into per-thread bucket cursors (threads visited from last to first, induction_bucket lowered by each thread's count), and after the second barrier every thread runs the aux place step with rm and I. */ static void libsais16x64_final_bwt_aux_scan_right_to_left_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ?
k : 256) && omp_get_dynamic() == 0) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(k); UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { libsais16x64_final_bwt_aux_scan_right_to_left_16u(T, SA, rm, I, induction_bucket, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.count = libsais16x64_final_bwt_aux_scan_right_to_left_16u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); } #pragma omp barrier #pragma omp master { fast_sint_t t; for (t = omp_num_threads - 1; t >= 0; --t) { sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; } } } #pragma omp barrier { libsais16x64_final_bwt_aux_scan_right_to_left_16u_block_place(SA, rm, I, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); } } #endif } } /* Processes one SA block [block_start, block_start + block_size) of the final-sorting right-to-left scan. Slices the block per thread (stride rounded down to a multiple of 16, remainder to the last thread). With a single thread the sequential scan is called. Otherwise every thread runs the final-sorting prepare step, the master thread turns the per-thread symbol counts into per-thread bucket cursors (threads visited from last to first, induction_bucket lowered by each thread's count), and after the second barrier every thread runs the order place step. */ static void libsais16x64_final_sorting_scan_right_to_left_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ?
k : 256) && omp_get_dynamic() == 0) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(k); UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { libsais16x64_final_sorting_scan_right_to_left_16u(T, SA, induction_bucket, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.count = libsais16x64_final_sorting_scan_right_to_left_16u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); } #pragma omp barrier #pragma omp master { fast_sint_t t; for (t = omp_num_threads - 1; t >= 0; --t) { sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; } } } #pragma omp barrier { libsais16x64_final_order_scan_right_to_left_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); } } #endif } } /* Processes one SA block [block_start, block_start + block_size) of the GSA right-to-left scan. Slices the block per thread (stride rounded down to a multiple of 16, remainder to the last thread). With a single thread the sequential GSA scan is called. Otherwise every thread runs the final-sorting prepare step (the GSA variant has no prepare step of its own here), the master thread turns the per-thread symbol counts into per-thread bucket cursors, and after the second barrier every thread runs the GSA place step, which only places records whose symbol is > 0. */ static void libsais16x64_final_gsa_scan_right_to_left_16u_block_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ?
k : 256) && omp_get_dynamic() == 0) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(k); UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { libsais16x64_final_gsa_scan_right_to_left_16u(T, SA, induction_bucket, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.count = libsais16x64_final_sorting_scan_right_to_left_16u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); } #pragma omp barrier #pragma omp master { fast_sint_t t; for (t = omp_num_threads - 1; t >= 0; --t) { sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; } } } #pragma omp barrier { libsais16x64_final_gsa_scan_right_to_left_16u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); } } #endif } } /* Processes one SA block [block_start, block_start + block_size) of the 32-bit-alphabet final-sorting right-to-left scan. The parallel region is entered when threads > 1 and block_size >= 16384. With one thread the sequential scan is used. Otherwise every thread gathers its slice into the shared cache, the master thread alone runs the sort step over the whole block, and after the second barrier every thread calls libsais16x64_compact_and_place_cached_suffixes on its slice. `cache - block_start` is passed to the helpers, which index the cache by SA position. */ static void libsais16x64_final_sorting_scan_right_to_left_32s_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num =
omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); UNUSED(cache); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { libsais16x64_final_sorting_scan_right_to_left_32s(T, SA, buckets, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { libsais16x64_final_sorting_scan_right_to_left_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); } #pragma omp barrier #pragma omp master { libsais16x64_final_sorting_scan_right_to_left_32s_block_sort(T, buckets, cache - block_start, block_start, block_size); } #pragma omp barrier { libsais16x64_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); } } #endif } } #endif /* Top-level BWT right-to-left scan over SA[0, n). Uses the sequential scan, and returns its result, when threads == 1 or n < 65536. Otherwise (OpenMP builds only) SA is walked from n - 1 downwards: an entry equal to 0 is skipped and its position is remembered in `index`; any other position starts a block that extends downwards until the next 0 entry or until threads * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads) entries have been collected. Blocks shorter than 32 entries are processed inline with the same per-entry logic as the prepare step (inducing directly into SA); longer blocks go to the block driver. Returns `index`, which stays -1 if no 0 entry was met on the parallel path. */ static sa_sint_t libsais16x64_final_bwt_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { sa_sint_t index = -1; if (threads == 1 || n < 65536) { index = libsais16x64_final_bwt_scan_right_to_left_16u(T, SA, induction_bucket, 0, n); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t block_start; for (block_start = (fast_sint_t)n - 1; block_start >= 0; ) { if (SA[block_start] == 0) { index = (sa_sint_t)block_start--; } else { fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < 0) { block_max_end = -1; } fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } fast_sint_t block_size = block_start -
block_end; if (block_size < 32) { for (; block_start > block_end; block_start -= 1) { sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[block_start] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; } } } else { libsais16x64_final_bwt_scan_right_to_left_16u_block_omp(T, SA, k, induction_bucket, block_end + 1, block_size, threads, thread_state); block_start = block_end; } } } } #else UNUSED(k); UNUSED(thread_state); #endif return index; } /* Top-level BWT-with-auxiliary-indexes right-to-left scan over SA[0, n). Sequential when threads == 1 or n < 65536. Otherwise (OpenMP builds only) it uses the same block-splitting walk as the plain BWT driver, except that nothing is returned and the maximum block length is threads * ((LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads) / 2); the aux prepare step writes two cache records per qualifying entry. The inline path for blocks shorter than 32 entries additionally stores induction_bucket[T[p]] + 1 into I[p / (rm + 1)] whenever (p & rm) == 0. */ static void libsais16x64_final_bwt_aux_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { if (threads == 1 || n < 65536) { libsais16x64_final_bwt_aux_scan_right_to_left_16u(T, SA, rm, I, induction_bucket, 0, n); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t block_start; for (block_start = (fast_sint_t)n - 1; block_start >= 0; ) { if (SA[block_start] == 0) { block_start--; } else { fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * ((LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads) / 2); if (block_max_end < 0) { block_max_end = -1; } fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } fast_sint_t block_size = block_start - block_end; if (block_size < 32) { for (; block_start > block_end; block_start -= 1) { sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; uint16_t c0 = T[p - (p > 0)], c1 = T[p]; SA[block_start] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ?
p : t; if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]] + 1; } } } } else { libsais16x64_final_bwt_aux_scan_right_to_left_16u_block_omp(T, SA, k, rm, I, induction_bucket, block_end + 1, block_size, threads, thread_state); block_start = block_end; } } } } #else UNUSED(k); UNUSED(thread_state); #endif } /* Top-level final-sorting right-to-left scan over SA[omp_block_start, omp_block_start + omp_block_size). Sequential when threads == 1 or the range is shorter than 65536. Otherwise (OpenMP builds only) the range is walked from its top down: entries equal to 0 are skipped, and any other position starts a block that extends downwards until the next 0 entry, the bottom of the range, or threads * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads) entries. Blocks shorter than 32 entries are induced inline; longer ones go to the block driver. */ static void libsais16x64_final_sorting_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { if (threads == 1 || omp_block_size < 65536) { libsais16x64_final_sorting_scan_right_to_left_16u(T, SA, induction_bucket, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t block_start; for (block_start = omp_block_start + omp_block_size - 1; block_start >= omp_block_start; ) { if (SA[block_start] == 0) { block_start--; } else { fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < omp_block_start) { block_max_end = omp_block_start - 1; } fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } fast_sint_t block_size = block_start - block_end; if (block_size < 32) { for (; block_start > block_end; block_start -= 1) { sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } } } else { libsais16x64_final_sorting_scan_right_to_left_16u_block_omp(T, SA, k, induction_bucket, block_end + 1, block_size, threads, thread_state); block_start = block_end; } } } } #else UNUSED(k); UNUSED(thread_state); #endif } /* Top-level GSA right-to-left scan over SA[omp_block_start, omp_block_start + omp_block_size). Same structure as the final-sorting driver: sequential GSA scan when threads == 1 or the range is shorter than 65536, otherwise a top-down walk that cuts the range into blocks at 0 entries or at the cache capacity. In the inline path for blocks shorter than 32 entries, an entry p only induces when p > 0 and T[p - 1] > 0. */ static void libsais16x64_final_gsa_scan_right_to_left_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t
omp_block_size, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { if (threads == 1 || omp_block_size < 65536) { libsais16x64_final_gsa_scan_right_to_left_16u(T, SA, induction_bucket, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t block_start; for (block_start = omp_block_start + omp_block_size - 1; block_start >= omp_block_start; ) { if (SA[block_start] == 0) { block_start--; } else { fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < omp_block_start) { block_max_end = omp_block_start - 1; } fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; } fast_sint_t block_size = block_start - block_end; if (block_size < 32) { for (; block_start > block_end; block_start -= 1) { sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0 && T[p - 1] > 0) { p--; SA[--induction_bucket[T[p]]] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } } } else { libsais16x64_final_gsa_scan_right_to_left_16u_block_omp(T, SA, k, induction_bucket, block_end + 1, block_size, threads, thread_state); block_start = block_end; } } } } #else UNUSED(k); UNUSED(thread_state); #endif } /* Top-level 32-bit-alphabet final-sorting right-to-left scan over SA[0, n). Sequential when threads == 1 or n < 65536. Otherwise (OpenMP builds only) SA is processed from the top in fixed blocks of threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries, the last block being clipped at index 0; each block is handed to the block driver together with the cache of thread_state[0]. */ static void libsais16x64_final_sorting_scan_right_to_left_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { if (threads == 1 || n < 65536) { libsais16x64_final_sorting_scan_right_to_left_32s(T, SA, induction_bucket, 0, n); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t block_start, block_end; for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end) { block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; }
libsais16x64_final_sorting_scan_right_to_left_32s_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads); } } #else UNUSED(thread_state); #endif } /* For every symbol c in [0, k), zero-fills SA[bucket_start[c] .. bucket_end[c]) when bucket_end[c] > bucket_start[c]. In OpenMP builds the loop over c is a parallel-for with a static schedule of chunk size 1, enabled when threads > 1 and n >= 65536. */ static void libsais16x64_clear_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT bucket_start, sa_sint_t * RESTRICT bucket_end, sa_sint_t threads) { fast_sint_t c; #if defined(LIBSAIS_OPENMP) #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && n >= 65536) #else UNUSED(threads); UNUSED(n); #endif for (c = 0; c < k; ++c) { if (bucket_end[c] > bucket_start[c]) { memset(&SA[bucket_start[c]], 0, ((size_t)bucket_end[c] - (size_t)bucket_start[c]) * sizeof(sa_sint_t)); } } } /* Runs the final left-to-right and right-to-left induction passes for 16-bit input, choosing the variant from `flags` and `I`. When LIBSAIS_FLAGS_BWT is clear: final-sorting passes, or, with LIBSAIS_FLAGS_GSA set, buckets[6 * ALPHABET_SIZE] is first set to buckets[7 * ALPHABET_SIZE] - 1 and the right-to-left pass is replaced by a marker flip plus the GSA scan over SA[buckets[7 * ALPHABET_SIZE], n); returns 0. When the BWT flag is set and I != NULL: BWT passes with auxiliary indexes, r - 1 being passed as the mask; returns 0. Otherwise: plain BWT passes; returns the value of the BWT right-to-left scan. The left-to-right pass uses the buckets at offset 6 * ALPHABET_SIZE and the right-to-left pass those at 7 * ALPHABET_SIZE. Between the two passes libsais16x64_clear_lms_suffixes_omp is called only when threads > 1 and n >= 65536. */ static sa_sint_t libsais16x64_induce_final_order_16u_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t flags, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { if ((flags & LIBSAIS_FLAGS_BWT) == 0) { if (flags & LIBSAIS_FLAGS_GSA) { buckets[6 * ALPHABET_SIZE] = buckets[7 * ALPHABET_SIZE] - 1; } libsais16x64_final_sorting_scan_left_to_right_16u_omp(T, SA, n, k, &buckets[6 * ALPHABET_SIZE], threads, thread_state); if (threads > 1 && n >= 65536) { libsais16x64_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); } if (flags & LIBSAIS_FLAGS_GSA) { libsais16x64_flip_suffix_markers_omp(SA, buckets[7 * ALPHABET_SIZE], threads); libsais16x64_final_gsa_scan_right_to_left_16u_omp(T, SA, buckets[7 * ALPHABET_SIZE], (fast_sint_t)n - buckets[7 * ALPHABET_SIZE], k, &buckets[7 * ALPHABET_SIZE], threads, thread_state); } else { libsais16x64_final_sorting_scan_right_to_left_16u_omp(T, SA, 0, n, k, &buckets[7 * ALPHABET_SIZE], threads, thread_state); } return 0; } else if (I != NULL) { libsais16x64_final_bwt_aux_scan_left_to_right_16u_omp(T, SA, n, k, r - 1, I,
&buckets[6 * ALPHABET_SIZE], threads, thread_state); if (threads > 1 && n >= 65536) { libsais16x64_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); } libsais16x64_final_bwt_aux_scan_right_to_left_16u_omp(T, SA, n, k, r - 1, I, &buckets[7 * ALPHABET_SIZE], threads, thread_state); return 0; } else { libsais16x64_final_bwt_scan_left_to_right_16u_omp(T, SA, n, k, &buckets[6 * ALPHABET_SIZE], threads, thread_state); if (threads > 1 && n >= 65536) { libsais16x64_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); } return libsais16x64_final_bwt_scan_right_to_left_16u_omp(T, SA, n, k, &buckets[7 * ALPHABET_SIZE], threads, thread_state); } } /* Final induction for the 32-bit alphabet with the 6k bucket layout: the left-to-right pass is given the buckets starting at offset 4 * k and the right-to-left pass those starting at offset 5 * k. */ static void libsais16x64_induce_final_order_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { libsais16x64_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[4 * (fast_sint_t)k], threads, thread_state); libsais16x64_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[5 * (fast_sint_t)k], threads, thread_state); } /* Final induction for the 32-bit alphabet with the 4k bucket layout: the left-to-right pass is given the buckets starting at offset 2 * k and the right-to-left pass those starting at offset 3 * k. */ static void libsais16x64_induce_final_order_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { libsais16x64_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[2 * (fast_sint_t)k], threads, thread_state); libsais16x64_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[3 * (fast_sint_t)k], threads, thread_state); } /* Final induction for the 32-bit alphabet with the 2k bucket layout: the left-to-right pass is given the buckets starting at offset 1 * k and the right-to-left pass those starting at offset 0. */ static void libsais16x64_induce_final_order_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { libsais16x64_final_sorting_scan_left_to_right_32s_omp(T, SA, n,
&buckets[1 * (fast_sint_t)k], threads, thread_state); libsais16x64_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[0 * (fast_sint_t)k], threads, thread_state); } /* Final induction for the 32-bit alphabet with a single bucket array: the buckets are recounted from T and initialised with libsais16x64_initialize_buckets_start_32s_1k before the left-to-right pass, then recounted and initialised with libsais16x64_initialize_buckets_end_32s_1k before the right-to-left pass. */ static void libsais16x64_induce_final_order_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { libsais16x64_count_suffixes_32s(T, n, k, buckets); libsais16x64_initialize_buckets_start_32s_1k(k, buckets); libsais16x64_final_sorting_scan_left_to_right_32s_omp(T, SA, n, buckets, threads, thread_state); libsais16x64_count_suffixes_32s(T, n, k, buckets); libsais16x64_initialize_buckets_end_32s_1k(k, buckets); libsais16x64_final_sorting_scan_right_to_left_32s_omp(T, SA, n, buckets, threads, thread_state); } /* Walks SA[omp_block_start, omp_block_start + omp_block_size) in ascending order and, for each entry p, inspects s = SAm[p >> 1] with SAm = &SA[m]. When s is negative, SAINT_MIN is OR-ed into T[p], the running counter f is incremented and s is replaced by i + SAINT_MIN + f (i being the current SA index). In every case s - f is stored back into SAm[p >> 1]. `f` is the counter value carried in by the caller; the updated counter is returned. The main loop is unrolled by four with libsais16x64_prefetch* hints up to three prefetch distances ahead; the trailing loop handles the remaining entries. */ static sa_sint_t libsais16x64_renumber_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t f, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; sa_sint_t * RESTRICT SAm = &SA[m]; sa_sint_t i, j; for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 2 * (sa_sint_t)prefetch_distance - 3; i < j; i += 4) { libsais16x64_prefetchr(&SA[i + 3 * prefetch_distance]); libsais16x64_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 0]) >> 1]); libsais16x64_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 1]) >> 1]); libsais16x64_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 2]) >> 1]); libsais16x64_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 3]) >> 1]); sa_uint_t q0 = (sa_uint_t)SA[i + prefetch_distance + 0]; sa_sint_t * RESTRICT Tq0 = &T[q0]; libsais16x64_prefetchw(SAm[q0 >> 1] < 0 ? Tq0 : &SAm[q0 >> 1]); sa_uint_t q1 = (sa_uint_t)SA[i + prefetch_distance + 1]; sa_sint_t * RESTRICT Tq1 = &T[q1]; libsais16x64_prefetchw(SAm[q1 >> 1] < 0 ?
Tq1 : &SAm[q1 >> 1]); sa_uint_t q2 = (sa_uint_t)SA[i + prefetch_distance + 2]; sa_sint_t * RESTRICT Tq2 = &T[q2]; libsais16x64_prefetchw(SAm[q2 >> 1] < 0 ? Tq2 : &SAm[q2 >> 1]); sa_uint_t q3 = (sa_uint_t)SA[i + prefetch_distance + 3]; sa_sint_t * RESTRICT Tq3 = &T[q3]; libsais16x64_prefetchw(SAm[q3 >> 1] < 0 ? Tq3 : &SAm[q3 >> 1]); sa_uint_t p0 = (sa_uint_t)SA[i + 0]; sa_sint_t s0 = SAm[p0 >> 1]; if (s0 < 0) { T[p0] |= SAINT_MIN; f++; s0 = i + 0 + SAINT_MIN + f; } SAm[p0 >> 1] = s0 - f; sa_uint_t p1 = (sa_uint_t)SA[i + 1]; sa_sint_t s1 = SAm[p1 >> 1]; if (s1 < 0) { T[p1] |= SAINT_MIN; f++; s1 = i + 1 + SAINT_MIN + f; } SAm[p1 >> 1] = s1 - f; sa_uint_t p2 = (sa_uint_t)SA[i + 2]; sa_sint_t s2 = SAm[p2 >> 1]; if (s2 < 0) { T[p2] |= SAINT_MIN; f++; s2 = i + 2 + SAINT_MIN + f; } SAm[p2 >> 1] = s2 - f; sa_uint_t p3 = (sa_uint_t)SA[i + 3]; sa_sint_t s3 = SAm[p3 >> 1]; if (s3 < 0) { T[p3] |= SAINT_MIN; f++; s3 = i + 3 + SAINT_MIN + f; } SAm[p3 >> 1] = s3 - f; } for (j += 2 * (sa_sint_t)prefetch_distance + 3; i < j; i += 1) { sa_uint_t p = (sa_uint_t)SA[i]; sa_sint_t s = SAm[p >> 1]; if (s < 0) { T[p] |= SAINT_MIN; f++; s = i + SAINT_MIN + f; } SAm[p >> 1] = s - f; } return f; } /* Scans SA[m + omp_block_start, m + omp_block_start + omp_block_size) from its highest index down and splits the entries into two downward-growing output runs inside SA. Each entry p is written, masked with SAINT_MAX, at cursor l, and l only moves down when p is negative as a signed value; p - 1 is written at cursor r, and r only moves down when p is positive. Both stores are unconditional, so a slot that is not kept is simply overwritten by the next entry. *pl and *pr are one-past-the-slot cursors on entry and are updated to the new one-past positions on return. */ static void libsais16x64_compact_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t * pl, fast_sint_t * pr, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; sa_uint_t * RESTRICT SAl = (sa_uint_t *)&SA[0]; sa_uint_t * RESTRICT SAr = (sa_uint_t *)&SA[0]; fast_sint_t i, j, l = *pl - 1, r = *pr - 1; for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j; i -= 4) { libsais16x64_prefetchr(&SA[i - prefetch_distance]); sa_uint_t p0 = (sa_uint_t)SA[i - 0]; SAl[l] = p0 & SAINT_MAX; l -= (sa_sint_t)p0 < 0; SAr[r] = p0 - 1; r -= (sa_sint_t)p0 > 0; sa_uint_t p1 = (sa_uint_t)SA[i - 1]; SAl[l] = p1 & SAINT_MAX; l -= (sa_sint_t)p1 < 0; SAr[r] = p1 - 1; r -= (sa_sint_t)p1
> 0; sa_uint_t p2 = (sa_uint_t)SA[i - 2]; SAl[l] = p2 & SAINT_MAX; l -= (sa_sint_t)p2 < 0; SAr[r] = p2 - 1; r -= (sa_sint_t)p2 > 0; sa_uint_t p3 = (sa_uint_t)SA[i - 3]; SAl[l] = p3 & SAINT_MAX; l -= (sa_sint_t)p3 < 0; SAr[r] = p3 - 1; r -= (sa_sint_t)p3 > 0; } for (j -= 3; i >= j; i -= 1) { sa_uint_t p = (sa_uint_t)SA[i]; SAl[l] = p & SAINT_MAX; l -= (sa_sint_t)p < 0; SAr[r] = p - 1; r -= (sa_sint_t)p > 0; } *pl = l + 1; *pr = r + 1; } #if defined(LIBSAIS_OPENMP) static sa_sint_t libsais16x64_count_unique_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; sa_sint_t * RESTRICT SAm = &SA[m]; fast_sint_t i, j; sa_sint_t f0 = 0, f1 = 0, f2 = 0, f3 = 0; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) { libsais16x64_prefetchr(&SA[i + 2 * prefetch_distance]); libsais16x64_prefetchr(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); libsais16x64_prefetchr(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); libsais16x64_prefetchr(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]); libsais16x64_prefetchr(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]); f0 += SAm[((sa_uint_t)SA[i + 0]) >> 1] < 0; f1 += SAm[((sa_uint_t)SA[i + 1]) >> 1] < 0; f2 += SAm[((sa_uint_t)SA[i + 2]) >> 1] < 0; f3 += SAm[((sa_uint_t)SA[i + 3]) >> 1] < 0; } for (j += prefetch_distance + 3; i < j; i += 1) { f0 += SAm[((sa_uint_t)SA[i]) >> 1] < 0; } return f0 + f1 + f2 + f3; } #endif static sa_sint_t libsais16x64_renumber_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { sa_sint_t f = 0; #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = 
omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Block stride is rounded down to a multiple of 16; the last thread takes the remainder. */
        fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;

        if (omp_num_threads == 1)
        {
            f = libsais16x64_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, 0, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /* Phase 1: every thread counts the negative entries reachable from its own block. */
            {
                thread_state[omp_thread_num].state.count = libsais16x64_count_unique_suffixes(SA, m, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* Phase 2: the sum of the counts of all lower-numbered threads is this thread's starting f. */
            {
                fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }

                /* Only the last thread knows the grand total, so it publishes the return value. */
                if (omp_thread_num == omp_num_threads - 1)
                {
                    f = (sa_sint_t)(count + thread_state[omp_thread_num].state.count);
                }

                libsais16x64_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, (sa_sint_t)count, omp_block_start, omp_block_size);
            }
        }
#endif
    }

    return f;
}

/*
 * Compacts the n >> 1 entries starting at SA[m] with
 * libsais16x64_compact_unique_and_nonunique_lms_suffixes_32s so that the sign-marked entries end
 * just below SA[m] and the positive entries end just below SA[n + fs]; finally the f entries
 * SA[m - f .. m) are copied to SA[n + fs - m ..).
 * The parallel path is taken only when OpenMP is enabled, threads > 1, n >= 131072 and m < fs.
 */
static void libsais16x64_compact_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072 && m < fs)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start;

        if (omp_num_threads == 1)
        {
            /* Single thread: write straight to the final destinations. */
            fast_sint_t l = m, r = (fast_sint_t)n + (fast_sint_t)fs;
            libsais16x64_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &l, &r, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /* Each thread compacts into scratch ranges tied to its own block: the sign-marked run ends */
            /* at m + (n >> 1) + block end, the positive run ends at m + block end.                     */
            {
                thread_state[omp_thread_num].state.position = (fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_start + omp_block_size;
                thread_state[omp_thread_num].state.count = (fast_sint_t)m + omp_block_start + omp_block_size;

                libsais16x64_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &thread_state[omp_thread_num].state.position, &thread_state[omp_thread_num].state.count, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* The master thread concatenates the per-thread runs, highest thread first. */
            #pragma omp master
            {
                fast_sint_t t, position;

                /* Sign-marked runs: packed so that they end at SA[m). */
                for (position = m, t = omp_num_threads - 1; t >= 0; --t)
                {
                    fast_sint_t omp_block_end = t < omp_num_threads - 1 ? omp_block_stride * (t + 1) : ((fast_sint_t)n >> 1);
                    fast_sint_t count = ((fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_end - thread_state[t].state.position);

                    if (count > 0)
                    {
                        position -= count; memcpy(&SA[position], &SA[thread_state[t].state.position], (size_t)count * sizeof(sa_sint_t));
                    }
                }

                /* Positive runs: packed so that they end at SA[n + fs). */
                for (position = (fast_sint_t)n + (fast_sint_t)fs, t = omp_num_threads - 1; t >= 0; --t)
                {
                    fast_sint_t omp_block_end = t < omp_num_threads - 1 ? omp_block_stride * (t + 1) : ((fast_sint_t)n >> 1);
                    fast_sint_t count = ((fast_sint_t)m + omp_block_end - thread_state[t].state.count);

                    if (count > 0)
                    {
                        position -= count; memcpy(&SA[position], &SA[thread_state[t].state.count], (size_t)count * sizeof(sa_sint_t));
                    }
                }
            }
        }
#endif
    }

    /* Move the f entries that now sit directly below SA[m] to SA[n + fs - m ..). */
    memcpy(&SA[(fast_sint_t)n + (fast_sint_t)fs - (fast_sint_t)m], &SA[(fast_sint_t)m - (fast_sint_t)f], (size_t)f * sizeof(sa_sint_t));
}

/*
 * Renumbers and then compacts the LMS suffixes; returns the count f produced by the renumbering
 * pass (the same value is handed to the compaction pass).
 */
static sa_sint_t libsais16x64_compact_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t f = libsais16x64_renumber_unique_and_nonunique_lms_suffixes_32s_omp(T, SA, m, threads, thread_state);
    libsais16x64_compact_unique_and_nonunique_lms_suffixes_32s_omp(SA, n, m, fs, f, threads, thread_state);
    return f;
}

/*
 * Scans T[omp_block_start .. omp_block_start + omp_block_size) for entries with the sign bit set.
 * For each hit the bit is cleared in T, the text position i is written to SA[tmp], and tmp is
 * refilled with the next value read sequentially from SA starting at index n - m - 1 + l.
 * After a hit the index is advanced by one extra position before the scan continues.
 */
static void libsais16x64_merge_unique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];

    sa_sint_t i, j; fast_sint_t tmp = *SAnm++;
    /* The unrolled body can advance i by up to 8 per iteration, hence the wider margin of 6. */
    for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 6; i < j; i += 4)
    {
        libsais16x64_prefetchr(&T[i + prefetch_distance]);

        sa_sint_t c0 = T[i + 0]; if (c0 < 0) { T[i + 0] = c0 & SAINT_MAX; SA[tmp] = i + 0; i++; tmp = *SAnm++; }
        sa_sint_t c1 = T[i + 1]; if (c1 < 0) { T[i + 1] = c1 & SAINT_MAX; SA[tmp] = i + 1; i++; tmp = *SAnm++; }
        sa_sint_t c2 = T[i + 2]; if (c2 < 0) { T[i + 2] = c2 & SAINT_MAX; SA[tmp] = i + 2; i++; tmp = *SAnm++; }
        sa_sint_t c3 = T[i + 3]; if (c3 < 0) { T[i + 3] = c3 & SAINT_MAX; SA[tmp] = i + 3; i++; tmp = *SAnm++; }
    }

    for (j += 6; i < j; i += 1)
    {
        sa_sint_t c = T[i]; if (c < 0) { T[i] = c & SAINT_MAX; SA[tmp] = i; i++; tmp = *SAnm++; }
    }
}

static void
libsais16x64_merge_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    /* Fills every zero slot of SA[omp_block_start .. omp_block_start + omp_block_size) with the  */
    /* next value read sequentially from SA starting at index n - m - 1 + l.                       */
    const fast_sint_t prefetch_distance = 64;

    const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l];

    fast_sint_t i, j; sa_sint_t tmp = *SAnm++;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4)
    {
        libsais16x64_prefetchr(&SA[i + prefetch_distance]);

        if (SA[i + 0] == 0) { SA[i + 0] = tmp; tmp = *SAnm++; }
        if (SA[i + 1] == 0) { SA[i + 1] = tmp; tmp = *SAnm++; }
        if (SA[i + 2] == 0) { SA[i + 2] = tmp; tmp = *SAnm++; }
        if (SA[i + 3] == 0) { SA[i + 3] = tmp; tmp = *SAnm++; }
    }

    /* Scalar tail for the last (up to three) slots. */
    for (j += 3; i < j; i += 1)
    {
        if (SA[i] == 0) { SA[i] = tmp; tmp = *SAnm++; }
    }
}

/*
 * Runs libsais16x64_merge_unique_lms_suffixes_32s over T[0 .. n). With OpenMP, threads > 1 and
 * n >= 65536 the range is split into blocks; each thread first counts the marked entries of its
 * block so that later blocks know how far into the source run they must start.
 */
static void libsais16x64_merge_unique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Block stride is rounded down to a multiple of 16; the last thread takes the remainder. */
        fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

        if (omp_num_threads == 1)
        {
            libsais16x64_merge_unique_lms_suffixes_32s(T, SA, n, m, 0, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                thread_state[omp_thread_num].state.count = libsais16x64_count_negative_marked_suffixes(T, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* Offset into the source run = number of marked entries in all preceding blocks. */
            {
                fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }

                libsais16x64_merge_unique_lms_suffixes_32s(T, SA, n, m, count, omp_block_start, omp_block_size);
            }
        }
#endif
    }
}

/*
 * Runs libsais16x64_merge_nonunique_lms_suffixes_32s over SA[0 .. m). The source offset starts at
 * f; in the parallel path (OpenMP, threads > 1, m >= 65536) each thread adds the number of zero
 * slots counted in all preceding blocks.
 */
static void libsais16x64_merge_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start;

        if (omp_num_threads == 1)
        {
            libsais16x64_merge_nonunique_lms_suffixes_32s(SA, n, m, f, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                thread_state[omp_thread_num].state.count = libsais16x64_count_zero_marked_suffixes(SA, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            {
                fast_sint_t t, count = f; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; }

                libsais16x64_merge_nonunique_lms_suffixes_32s(SA, n, m, count, omp_block_start, omp_block_size);
            }
        }
#endif
    }
}

/* Merges first the entries marked in T, then the entries filling the zero slots of SA[0 .. m). */
static void libsais16x64_merge_compacted_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    libsais16x64_merge_unique_lms_suffixes_32s_omp(T, SA, n, m, threads, thread_state);
    libsais16x64_merge_nonunique_lms_suffixes_32s_omp(SA, n, m, f, threads, thread_state);
}

/*
 * Rebuilds the LMS suffixes in SA after the recursive call, also recounting the 2k bucket histogram.
 * When f > 0 (some suffixes were compacted away before recursing) the f saved entries are moved from
 * SA[n + fs - m ..) to SA[n - m - 1 ..), the remaining m - f suffixes are reconstructed and appended
 * after them, SA[0 .. m) is cleared and both groups are merged back. When f == 0 a plain
 * gather + reconstruct is enough.
 */
static void libsais16x64_reconstruct_compacted_lms_suffixes_32s_2k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    if (f > 0)
    {
        /* memmove: source and destination ranges may overlap. */
        memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t));

        libsais16x64_count_and_gather_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, local_buckets, threads, thread_state);
        libsais16x64_reconstruct_lms_suffixes_omp(SA, n, m - f, threads);

        memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t));
        memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t));

        libsais16x64_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state);
    }
    else
    {
        libsais16x64_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
        libsais16x64_reconstruct_lms_suffixes_omp(SA, n, m, threads);
    }
}

static void libsais16x64_reconstruct_compacted_lms_suffixes_32s_1k_omp(sa_sint_t *
RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { if (f > 0) { memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t)); libsais16x64_gather_compacted_lms_suffixes_32s(T, SA, n); libsais16x64_reconstruct_lms_suffixes_omp(SA, n, m - f, threads); memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t)); memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t)); libsais16x64_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state); } else { libsais16x64_gather_lms_suffixes_32s(T, SA, n); libsais16x64_reconstruct_lms_suffixes_omp(SA, n, m, threads); } } static void libsais16x64_convert_32u_to_64u(uint32_t * RESTRICT S, uint64_t * RESTRICT D, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size; i < j; i += 1) { D[i] = (uint64_t)S[i]; } } static void libsais16x64_convert_inplace_32u_to_64u(uint32_t * V, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start; i >= j; i -= 1) { #if defined(__LITTLE_ENDIAN__) V[i + i + 0] = V[i]; V[i + i + 1] = 0; #else V[i + i + 0] = 0; V[i + i + 1] = V[i]; #endif } } static void libsais16x64_convert_inplace_64u_to_32u(uint32_t * V, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size; i < j; i += 1) { #if defined(__LITTLE_ENDIAN__) V[i] = V[i + i + 0]; #else V[i] = V[i + i + 1]; #endif } } static void libsais16x64_convert_inplace_32u_to_64u_omp(uint32_t * V, sa_sint_t n, sa_sint_t threads) { while (n >= 65536) { fast_sint_t block_size = n >> 1; n -= block_size; #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = 
omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; libsais16x64_convert_32u_to_64u(((uint32_t *)(void *)V) + n, ((uint64_t *)(void *)V) + n, omp_block_start, omp_block_size); } } libsais16x64_convert_inplace_32u_to_64u(V, 0, n); } static sa_sint_t libsais16x64_main_32s_recursion(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state, sa_sint_t * RESTRICT local_buffer) { fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n); if (n <= INT32_MAX) { sa_sint_t new_fs = (fs + fs + n + n) <= INT32_MAX ? (fs + fs + n) : INT32_MAX - n; if ((new_fs / k >= 6) || (new_fs / k >= 4 && n <= INT32_MAX / 2) || (new_fs / k < 4 && new_fs >= fs)) { libsais16x64_convert_inplace_64u_to_32u((uint32_t *)(void *)T, 0, n); #if defined(LIBSAIS_OPENMP) sa_sint_t index = libsais16_int_omp((int32_t *)T, (int32_t *)SA, (int32_t)n, (int32_t)k, (int32_t)new_fs, (int32_t)threads); #else sa_sint_t index = libsais16_int((int32_t *)T, (int32_t *)SA, (int32_t)n, (int32_t)k, (int32_t)new_fs); #endif if (index >= 0) { libsais16x64_convert_inplace_32u_to_64u_omp((uint32_t *)SA, n, threads); libsais16x64_convert_inplace_32u_to_64u_omp((uint32_t *)T, n, threads); } return index; } } if (k > 0 && ((fs / k >= 6) || (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 6))) { sa_sint_t alignment = (fs - 1024) / k >= 6 ? (sa_sint_t)1024 : (sa_sint_t)16; sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 6 ? 
(sa_sint_t *)libsais16x64_align_up(&SA[n + fs - 6 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 6 * (fast_sint_t)k];
        /* The caller-provided local buffer is used instead whenever fs is smaller than it. */
        buckets = (LIBSAIS_LOCAL_BUFFER_SIZE > fs) ? local_buffer : buckets;

        sa_sint_t m = libsais16x64_count_and_gather_lms_suffixes_32s_4k_omp(T, SA, n, k, buckets, buckets == local_buffer, threads, thread_state);
        if (m > 1)
        {
            memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t));

            sa_sint_t first_lms_suffix = SA[n - m];
            sa_sint_t left_suffixes_count = libsais16x64_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(T, k, buckets, first_lms_suffix);

            libsais16x64_radix_sort_lms_suffixes_32s_6k_omp(T, SA, n, m, &buckets[4 * (fast_sint_t)k], threads, thread_state);
            /* The same (n / 8192) < k test selects the marker-based naming and the compaction below. */
            if ((n / 8192) < k) { libsais16x64_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * (fast_sint_t)k], threads); }

            if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); }

            libsais16x64_initialize_buckets_for_partial_sorting_32s_6k(T, k, buckets, first_lms_suffix, left_suffixes_count);
            libsais16x64_induce_partial_order_32s_6k_omp(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state);

            sa_sint_t names = (n / 8192) < k
                ? libsais16x64_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state)
                : libsais16x64_renumber_and_gather_lms_suffixes_omp(SA, n, m, fs, threads, thread_state);

            /* Fewer names than LMS suffixes: names are not unique yet, so sort the reduced problem recursively. */
            if (names < m)
            {
                sa_sint_t f = (n / 8192) < k ?
libsais16x64_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state) : 0;

                /* The reduced text lives at SA + n + fs - m + f and has m - f symbols over names - f values. */
                if (libsais16x64_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0)
                {
                    return -2;
                }

                libsais16x64_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, buckets == local_buffer, threads, thread_state);
            }
            else
            {
                libsais16x64_count_lms_suffixes_32s_2k(T, n, k, buckets);
            }

            libsais16x64_initialize_buckets_start_and_end_32s_4k(k, buckets);
            libsais16x64_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets);
            libsais16x64_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state);
        }
        else
        {
            /* At most one LMS suffix: move the entry from SA[n - 1] to SA[0] and induce directly. */
            SA[0] = SA[n - 1];

            libsais16x64_initialize_buckets_start_and_end_32s_6k(k, buckets);
            libsais16x64_place_lms_suffixes_histogram_32s_6k(SA, n, k, m, buckets);
            libsais16x64_induce_final_order_32s_6k(T, SA, n, k, buckets, threads, thread_state);
        }

        return 0;
    }
    /* 4k bucket layout; additionally requires n <= SAINT_MAX / 2. */
    else if (k > 0 && (n <= SAINT_MAX / 2) && ((fs / k >= 4) || (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 4)))
    {
        sa_sint_t alignment = (fs - 1024) / k >= 4 ? (sa_sint_t)1024 : (sa_sint_t)16;
        sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 4 ? (sa_sint_t *)libsais16x64_align_up(&SA[n + fs - 4 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 4 * (fast_sint_t)k];
        buckets = (LIBSAIS_LOCAL_BUFFER_SIZE > fs) ?
local_buffer : buckets;

        sa_sint_t m = libsais16x64_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, buckets == local_buffer, threads, thread_state);
        if (m > 1)
        {
            libsais16x64_initialize_buckets_for_radix_and_partial_sorting_32s_4k(T, k, buckets, SA[n - m]);

            libsais16x64_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state);
            libsais16x64_radix_sort_set_markers_32s_4k_omp(SA, k, &buckets[1], threads);

            libsais16x64_place_lms_suffixes_interval_32s_4k(SA, n, k, m - 1, buckets);
            libsais16x64_induce_partial_order_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state);

            sa_sint_t names = libsais16x64_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state);
            if (names < m)
            {
                sa_sint_t f = libsais16x64_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);

                if (libsais16x64_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0)
                {
                    return -2;
                }

                libsais16x64_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, buckets == local_buffer, threads, thread_state);
            }
            else
            {
                libsais16x64_count_lms_suffixes_32s_2k(T, n, k, buckets);
            }
        }
        else
        {
            SA[0] = SA[n - 1];
        }

        libsais16x64_initialize_buckets_start_and_end_32s_4k(k, buckets);
        libsais16x64_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets);
        libsais16x64_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state);

        return 0;
    }
    /* 2k bucket layout. */
    else if (k > 0 && ((fs / k >= 2) || (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 2)))
    {
        sa_sint_t alignment = (fs - 1024) / k >= 2 ? (sa_sint_t)1024 : (sa_sint_t)16;
        sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 2 ? (sa_sint_t *)libsais16x64_align_up(&SA[n + fs - 2 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 2 * (fast_sint_t)k];
        buckets = (LIBSAIS_LOCAL_BUFFER_SIZE > fs) ?
local_buffer : buckets;

        sa_sint_t m = libsais16x64_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, buckets == local_buffer, threads, thread_state);
        if (m > 1)
        {
            libsais16x64_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(T, k, buckets, SA[n - m]);

            libsais16x64_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state);
            libsais16x64_place_lms_suffixes_interval_32s_2k(SA, n, k, m - 1, buckets);

            libsais16x64_initialize_buckets_start_and_end_32s_2k(k, buckets);
            libsais16x64_induce_partial_order_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state);

            sa_sint_t names = libsais16x64_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads);
            if (names < m)
            {
                sa_sint_t f = libsais16x64_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);

                if (libsais16x64_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0)
                {
                    return -2;
                }

                libsais16x64_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, buckets == local_buffer, threads, thread_state);
            }
            else
            {
                libsais16x64_count_lms_suffixes_32s_2k(T, n, k, buckets);
            }
        }
        else
        {
            SA[0] = SA[n - 1];
        }

        libsais16x64_initialize_buckets_end_32s_2k(k, buckets);
        libsais16x64_place_lms_suffixes_histogram_32s_2k(SA, n, k, m, buckets);

        libsais16x64_initialize_buckets_start_and_end_32s_2k(k, buckets);
        libsais16x64_induce_final_order_32s_2k(T, SA, n, k, buckets, threads, thread_state);

        return 0;
    }
    /* 1k bucket layout: a single k-entry table, taken from the free space or from the heap. */
    else
    {
        /* A heap buffer is needed only when the free space cannot hold k entries. */
        sa_sint_t * buffer = fs < k ? (sa_sint_t *)libsais16x64_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096) : (sa_sint_t *)NULL;

        sa_sint_t alignment = fs - 1024 >= k ? (sa_sint_t)1024 : (sa_sint_t)16;
        sa_sint_t * RESTRICT buckets = fs - alignment >= k ? (sa_sint_t *)libsais16x64_align_up(&SA[n + fs - k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : fs >= k ?
&SA[n + fs - k] : buffer;

        /* buckets can only be NULL here if the heap allocation above failed. */
        if (buckets == NULL) { return -2; }

        memset(SA, 0, (size_t)n * sizeof(sa_sint_t));

        libsais16x64_count_suffixes_32s(T, n, k, buckets);
        libsais16x64_initialize_buckets_end_32s_1k(k, buckets);

        sa_sint_t m = libsais16x64_radix_sort_lms_suffixes_32s_1k(T, SA, n, buckets);
        if (m > 1)
        {
            libsais16x64_induce_partial_order_32s_1k_omp(T, SA, n, k, buckets, threads, thread_state);

            sa_sint_t names = libsais16x64_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads);
            if (names < m)
            {
                /* Release the heap table for the duration of the recursion; buckets == NULL records */
                /* that it has to be allocated again afterwards.                                      */
                if (buffer != NULL) { libsais16x64_free_aligned(buffer); buckets = NULL; }

                sa_sint_t f = libsais16x64_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state);

                if (libsais16x64_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0)
                {
                    /* Nothing to free on this path: a heap table, if any, was released above. */
                    return -2;
                }

                libsais16x64_reconstruct_compacted_lms_suffixes_32s_1k_omp(T, SA, n, m, fs, f, threads, thread_state);

                if (buckets == NULL) { buckets = buffer = (sa_sint_t *)libsais16x64_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096); }
                if (buckets == NULL) { return -2; }
            }

            /* The bucket table is recounted from T before the LMS suffixes are placed. */
            libsais16x64_count_suffixes_32s(T, n, k, buckets);
            libsais16x64_initialize_buckets_end_32s_1k(k, buckets);
            libsais16x64_place_lms_suffixes_interval_32s_1k(T, SA, k, m, buckets);
        }

        libsais16x64_induce_final_order_32s_1k(T, SA, n, k, buckets, threads, thread_state);
        libsais16x64_free_aligned(buffer);

        return 0;
    }
}

/*
 * Entry point for integer-alphabet sorting: provides a stack buffer of
 * 2 * LIBSAIS_LOCAL_BUFFER_SIZE entries and hands its second half to the recursion as local_buffer.
 * NOTE(review): the first half is presumably headroom for entries addressed below the bucket
 * base - confirm against the bucket helpers.
 */
static sa_sint_t libsais16x64_main_32s_entry(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t local_buffer[2 * LIBSAIS_LOCAL_BUFFER_SIZE];

    return libsais16x64_main_32s_recursion(T, SA, n, k, fs, threads, thread_state, local_buffer + LIBSAIS_LOCAL_BUFFER_SIZE);
}

/*
 * Core routine for 16-bit input: gathers and sorts the LMS suffixes of T (recursing on the reduced
 * problem when their names are not unique) and then induces the final order.
 * Returns -1 when LIBSAIS_FLAGS_GSA is set and the bucket check fails, -2 when the nested sort
 * fails, otherwise the result of the final induction pass.
 */
static sa_sint_t libsais16x64_main_16u(const uint16_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t flags, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t fs,
sa_sint_t * freq, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    /* Keep n + fs representable. */
    fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n);

    sa_sint_t m = libsais16x64_count_and_gather_lms_suffixes_16u_omp(T, SA, n, buckets, threads, thread_state);
    sa_sint_t k = libsais16x64_initialize_buckets_start_and_end_16u(buckets, freq);

    /* GSA mode rejects the input unless these three bucket entries have the expected values. */
    /* NOTE(review): presumably this enforces a single trailing 0 separator - confirm against the bucket layout. */
    if ((flags & LIBSAIS_FLAGS_GSA) && (buckets[0] != 0 || buckets[2] != 0 || buckets[3] != 1))
    {
        return -1;
    }

    if (m > 0)
    {
        sa_sint_t first_lms_suffix = SA[n - m];
        sa_sint_t left_suffixes_count = libsais16x64_initialize_buckets_for_lms_suffixes_radix_sort_16u(T, buckets, first_lms_suffix);

        /* The explicit clears are only done on the multi-threaded large-input path. */
        if (threads > 1 && n >= 65536) { memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t)); }
        libsais16x64_radix_sort_lms_suffixes_16u_omp(T, SA, n, m, flags, buckets, threads, thread_state);
        if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); }

        libsais16x64_initialize_buckets_for_partial_sorting_16u(T, buckets, first_lms_suffix, left_suffixes_count);
        libsais16x64_induce_partial_order_16u_omp(T, SA, n, k, flags, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state);

        sa_sint_t names = libsais16x64_renumber_and_gather_lms_suffixes_omp(SA, n, m, fs, threads, thread_state);
        /* Fewer names than LMS suffixes: sort the reduced text stored at SA + n + fs - m. */
        if (names < m)
        {
            if (libsais16x64_main_32s_entry(SA + n + fs - m, SA, m, names, fs + n - 2 * m, threads, thread_state) != 0)
            {
                return -2;
            }

            libsais16x64_gather_lms_suffixes_16u_omp(T, SA, n, threads, thread_state);
            libsais16x64_reconstruct_lms_suffixes_omp(SA, n, m, threads);
        }

        libsais16x64_place_lms_suffixes_interval_16u(SA, n, m, flags, buckets);
    }
    else
    {
        memset(SA, 0, (size_t)n * sizeof(sa_sint_t));
    }

    return libsais16x64_induce_final_order_16u_omp(T, SA, n, k, flags, r, I, buckets, threads, thread_state);
}

/*
 * Allocates the working state for a 16-bit run (per-thread state when threads > 1 and a bucket
 * table of 8 * ALPHABET_SIZE entries), runs libsais16x64_main_16u and releases everything.
 * Returns -2 if an allocation failed, otherwise the result of libsais16x64_main_16u.
 */
static sa_sint_t libsais16x64_main(const uint16_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t flags, sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads)
{
    LIBSAIS_THREAD_STATE * RESTRICT
thread_state = threads > 1 ? libsais16x64_alloc_thread_state(threads) : NULL;
    sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais16x64_alloc_aligned((size_t)8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096);

    /* thread_state is legitimately NULL only for the single-threaded case. */
    sa_sint_t index = buckets != NULL && (thread_state != NULL || threads == 1)
        ? libsais16x64_main_16u(T, SA, n, buckets, flags, r, I, fs, freq, threads, thread_state)
        : -2;

    libsais16x64_free_aligned(buckets);
    libsais16x64_free_thread_state(thread_state);

    return index;
}

/*
 * Integer-alphabet counterpart of libsais16x64_main: allocates per-thread state when threads > 1,
 * runs libsais16x64_main_32s_entry and releases the state. Returns -2 if the allocation failed.
 */
static sa_sint_t libsais16x64_main_long(sa_sint_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads)
{
    LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais16x64_alloc_thread_state(threads) : NULL;

    sa_sint_t index = thread_state != NULL || threads == 1
        ? libsais16x64_main_32s_entry(T, SA, n, k, fs, threads, thread_state)
        : -2;

    libsais16x64_free_thread_state(thread_state);

    return index;
}

/* Copies A[0 .. n) to U[0 .. n), truncating every element to 16 bits (8-way unrolled). */
static void libsais16x64_bwt_copy_16u(uint16_t * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8)
    {
        libsais16x64_prefetchr(&A[i + prefetch_distance]);

        U[i + 0] = (uint16_t)A[i + 0];
        U[i + 1] = (uint16_t)A[i + 1];
        U[i + 2] = (uint16_t)A[i + 2];
        U[i + 3] = (uint16_t)A[i + 3];
        U[i + 4] = (uint16_t)A[i + 4];
        U[i + 5] = (uint16_t)A[i + 5];
        U[i + 6] = (uint16_t)A[i + 6];
        U[i + 7] = (uint16_t)A[i + 7];
    }

    /* Scalar tail for the last (up to seven) elements. */
    for (j += 7; i < j; i += 1)
    {
        U[i] = (uint16_t)A[i];
    }
}

/*
 * Block-parallel wrapper around libsais16x64_bwt_copy_16u; runs in parallel only when OpenMP is
 * enabled, threads > 1 and n >= 65536.
 */
static void libsais16x64_bwt_copy_16u_omp(uint16_t * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
        fast_sint_t omp_block_stride = ((fast_sint_t)n / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t
omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : (fast_sint_t)n - omp_block_start; #else UNUSED(threads); fast_sint_t omp_block_start = 0; fast_sint_t omp_block_size = (fast_sint_t)n; #endif libsais16x64_bwt_copy_16u(U + omp_block_start, A + omp_block_start, (sa_sint_t)omp_block_size); } } int64_t libsais16x64(const uint16_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq) { if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int64_t)); } if (n == 1) { SA[0] = 0; if (freq != NULL) { freq[T[0]]++; } } return 0; } if (n <= INT32_MAX) { sa_sint_t new_fs = (fs + fs + n + n) <= INT32_MAX ? (fs + fs + n) : INT32_MAX - n; sa_sint_t index = libsais16(T, (int32_t *)SA, (int32_t)n, (int32_t)new_fs, (int32_t *)freq); if (index >= 0) { libsais16x64_convert_inplace_32u_to_64u_omp((uint32_t *)SA, n, 1); if (freq != NULL) { libsais16x64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, ALPHABET_SIZE, 1); } } return index; } return libsais16x64_main(T, SA, n, LIBSAIS_FLAGS_NONE, 0, NULL, fs, freq, 1); } int64_t libsais16x64_gsa(const uint16_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq) { if ((T == NULL) || (SA == NULL) || (n < 0) || (n > 0 && T[n - 1] != 0) || (fs < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int64_t)); } if (n == 1) { SA[0] = 0; if (freq != NULL) { freq[T[0]]++; } } return 0; } if (n <= INT32_MAX) { sa_sint_t new_fs = (fs + fs + n + n) <= INT32_MAX ? 
(fs + fs + n) : INT32_MAX - n; sa_sint_t index = libsais16_gsa(T, (int32_t *)SA, (int32_t)n, (int32_t)new_fs, (int32_t *)freq); if (index >= 0) { libsais16x64_convert_inplace_32u_to_64u_omp((uint32_t *)SA, n, 1); if (freq != NULL) { libsais16x64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, ALPHABET_SIZE, 1); } } return index; } return libsais16x64_main(T, SA, n, LIBSAIS_FLAGS_GSA, 0, NULL, fs, freq, 1); } int64_t libsais16x64_long(int64_t * T, int64_t * SA, int64_t n, int64_t k, int64_t fs) { if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) { return -1; } else if (n <= 1) { if (n == 1) { SA[0] = 0; } return 0; } return libsais16x64_main_long(T, SA, n, k, fs, 1); } int64_t libsais16x64_bwt(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int64_t)); } if (n == 1) { U[0] = T[0]; if (freq != NULL) { freq[T[0]]++; } } return n; } if (n <= INT32_MAX) { sa_sint_t new_fs = (fs + fs + n + n) <= INT32_MAX ? 
(fs + fs + n) : INT32_MAX - n; sa_sint_t index = libsais16_bwt(T, U, (int32_t *)A, (int32_t)n, (int32_t)new_fs, (int32_t *)freq); if (index >= 0) { if (freq != NULL) { libsais16x64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, ALPHABET_SIZE, 1); } } return index; } sa_sint_t index = libsais16x64_main(T, A, n, LIBSAIS_FLAGS_BWT, 0, NULL, fs, freq, 1); if (index >= 0) { index++; U[0] = T[n - 1]; libsais16x64_bwt_copy_16u_omp(U + 1, A, index - 1, 1); libsais16x64_bwt_copy_16u_omp(U + index, A + index, n - index, 1); } return index; } int64_t libsais16x64_bwt_aux(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq, int64_t r, int64_t * I) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int64_t)); } if (n == 1) { U[0] = T[0]; if (freq != NULL) { freq[T[0]]++; } } I[0] = n; return 0; } if (n <= INT32_MAX && r <= INT32_MAX) { sa_sint_t new_fs = (fs + fs + n + n) <= INT32_MAX ? 
(fs + fs + n) : INT32_MAX - n;
        sa_sint_t index = libsais16_bwt_aux(T, U, (int32_t *)A, (int32_t)n, (int32_t)new_fs, (int32_t *)freq, (int32_t)r, (int32_t *)I);
        if (index >= 0)
        {
            /* One auxiliary index per block of r symbols: 1 + (n - 1) / r entries were written as 32-bit values. */
            libsais16x64_convert_inplace_32u_to_64u_omp((uint32_t *)I, 1 + ((n - 1) / r), 1);
            if (freq != NULL) { libsais16x64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, ALPHABET_SIZE, 1); }
        }

        return index;
    }

    sa_sint_t index = libsais16x64_main(T, A, n, LIBSAIS_FLAGS_BWT, r, I, fs, freq, 1);
    if (index == 0)
    {
        /* Same output layout as the non-aux BWT, with I[0] acting as the split position. */
        U[0] = T[n - 1];
        libsais16x64_bwt_copy_16u_omp(U + 1, A, I[0] - 1, 1);
        libsais16x64_bwt_copy_16u_omp(U + I[0], A + I[0], n - I[0], 1);
    }

    return index;
}

#if defined(LIBSAIS_OPENMP)

/*
 * Suffix array construction for the 16-bit string T (length n) using OpenMP.
 *
 * threads == 0 selects a default derived from the OpenMP runtime; negative
 * values are rejected. Returns -1 on invalid arguments and 0 for n <= 1.
 * Inputs that fit in 32 bits are delegated to libsais16_omp() and widened
 * afterwards; larger inputs go through libsais16x64_main().
 */
int64_t libsais16x64_omp(const uint16_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq, int64_t threads)
{
    if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0) || (threads < 0))
    {
        return -1;
    }
    else if (n <= 1)
    {
        if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int64_t)); }
        if (n == 1) { SA[0] = 0; if (freq != NULL) { freq[T[0]]++; } }
        return 0;
    }

    /* threads == 0: take max threads divided by the size of the current team; the integer division may give 0, hence the clamp to 1. */
    threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads());
    threads = threads > 0 ? threads : 1;

    if (n <= INT32_MAX)
    {
        /* SA is about to be reinterpreted as int32_t, so the free space is re-expressed in 32-bit elements. */
        sa_sint_t new_fs = (fs + fs + n + n) <= INT32_MAX ?
(fs + fs + n) : INT32_MAX - n; sa_sint_t index = libsais16_omp(T, (int32_t *)SA, (int32_t)n, (int32_t)new_fs, (int32_t *)freq, (int32_t)threads); if (index >= 0) { libsais16x64_convert_inplace_32u_to_64u_omp((uint32_t *)SA, n, threads); if (freq != NULL) { libsais16x64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, ALPHABET_SIZE, threads); } } return index; } return libsais16x64_main(T, SA, n, LIBSAIS_FLAGS_NONE, 0, NULL, fs, freq, threads); } int64_t libsais16x64_gsa_omp(const uint16_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq, int64_t threads) { if ((T == NULL) || (SA == NULL) || (n < 0) || (n > 0 && T[n - 1] != 0) || (fs < 0) || (threads < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int64_t)); } if (n == 1) { SA[0] = 0; if (freq != NULL) { freq[T[0]]++; } } return 0; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? threads : 1; if (n <= INT32_MAX) { sa_sint_t new_fs = (fs + fs + n + n) <= INT32_MAX ? (fs + fs + n) : INT32_MAX - n; sa_sint_t index = libsais16_gsa_omp(T, (int32_t *)SA, (int32_t)n, (int32_t)new_fs, (int32_t *)freq, (int32_t)threads); if (index >= 0) { libsais16x64_convert_inplace_32u_to_64u_omp((uint32_t *)SA, n, threads); if (freq != NULL) { libsais16x64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, ALPHABET_SIZE, threads); } } return index; } return libsais16x64_main(T, SA, n, LIBSAIS_FLAGS_GSA, 0, NULL, fs, freq, threads); } int64_t libsais16x64_long_omp(int64_t * T, int64_t * SA, int64_t n, int64_t k, int64_t fs, int64_t threads) { if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0) || (threads < 0)) { return -1; } else if (n <= 1) { if (n == 1) { SA[0] = 0; } return 0; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? 
threads : 1; return libsais16x64_main_long(T, SA, n, k, fs, threads); } int64_t libsais16x64_bwt_omp(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq, int64_t threads) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (threads < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int64_t)); } if (n == 1) { U[0] = T[0]; if (freq != NULL) { freq[T[0]]++; } } return n; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? threads : 1; if (n <= INT32_MAX) { sa_sint_t new_fs = (fs + fs + n + n) <= INT32_MAX ? (fs + fs + n) : INT32_MAX - n; sa_sint_t index = libsais16_bwt_omp(T, U, (int32_t *)A, (int32_t)n, (int32_t)new_fs, (int32_t *)freq, (int32_t)threads); if (index >= 0) { if (freq != NULL) { libsais16x64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, ALPHABET_SIZE, threads); } } return index; } sa_sint_t index = libsais16x64_main(T, A, n, LIBSAIS_FLAGS_BWT, 0, NULL, fs, freq, threads); if (index >= 0) { index++; U[0] = T[n - 1]; libsais16x64_bwt_copy_16u_omp(U + 1, A, index - 1, threads); libsais16x64_bwt_copy_16u_omp(U + index, A + index, n - index, threads); } return index; } int64_t libsais16x64_bwt_aux_omp(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq, int64_t r, int64_t * I, int64_t threads) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL) || (threads < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int64_t)); } if (n == 1) { U[0] = T[0]; if (freq != NULL) { freq[T[0]]++; } } I[0] = n; return 0; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? threads : 1; if (n <= INT32_MAX && r <= INT32_MAX) { sa_sint_t new_fs = (fs + fs + n + n) <= INT32_MAX ? 
(fs + fs + n) : INT32_MAX - n;
        sa_sint_t index = libsais16_bwt_aux_omp(T, U, (int32_t *)A, (int32_t)n, (int32_t)new_fs, (int32_t *)freq, (int32_t)r, (int32_t *)I, (int32_t)threads);
        if (index >= 0)
        {
            /* One auxiliary index per block of r symbols: 1 + (n - 1) / r entries were written as 32-bit values. */
            libsais16x64_convert_inplace_32u_to_64u_omp((uint32_t *)I, 1 + ((n - 1) / r), threads);
            if (freq != NULL) { libsais16x64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, ALPHABET_SIZE, threads); }
        }

        return index;
    }

    sa_sint_t index = libsais16x64_main(T, A, n, LIBSAIS_FLAGS_BWT, r, I, fs, freq, threads);
    if (index == 0)
    {
        /* U is assembled as: T[n - 1], then A[0 .. I[0] - 2], then A[I[0] .. n - 1]. */
        U[0] = T[n - 1];
        libsais16x64_bwt_copy_16u_omp(U + 1, A, I[0] - 1, threads);
        libsais16x64_bwt_copy_16u_omp(U + I[0], A + I[0], n - I[0], threads);
    }

    return index;
}

#endif

/* Adds the symbol counts of T[0 .. n - 1] to count[]; the caller is responsible for zeroing count first. */
static void libsais16x64_unbwt_compute_histogram(const uint16_t * RESTRICT T, fast_sint_t n, sa_uint_t * RESTRICT count)
{
    fast_sint_t i; for (i = 0; i < n; i += 1) { count[T[i]]++; }
}

/*
 * Turns the per-symbol counts in bucket2[] into starting positions and builds
 * the fastbits[] lookup table.
 *
 * The running sum starts at 1, so after the call bucket2[w] is one plus the
 * number of symbols smaller than w. For every symbol that actually occurs,
 * the still-unfilled fastbits slots up to ((end of its range - 1) >> shift)
 * are set to that symbol, so fastbits[p >> shift] never exceeds the symbol
 * that owns position p.
 */
static void libsais16x64_unbwt_calculate_fastbits(sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift)
{
    fast_uint_t v, w, sum;
    for (v = 0, sum = 1, w = 0; w < ALPHABET_SIZE; ++w)
    {
        fast_uint_t prev = sum;

        /* count -> start position (exclusive prefix sum, offset by one) */
        sum += bucket2[w]; bucket2[w] = (sa_uint_t)prev;
        if (prev != sum)
        {
            /* symbol w occurs: it claims every table slot up to the last position of its range */
            for (; v <= ((sum - 1) >> shift); ++v) { fastbits[v] = (uint16_t)w; }
        }
    }
}

/*
 * Scatters the positions of one block of T into P, bucketed by symbol.
 *
 * Positions below `index` are recorded as i for symbol T[i]; positions after
 * it are recorded as i for symbol T[i - 1] (the value `index` itself is never
 * stored). bucket2[c] is used as the write cursor for symbol c and is advanced
 * past every stored entry.
 */
static void libsais16x64_unbwt_calculate_P(const uint16_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, fast_uint_t index, fast_sint_t omp_block_start, fast_sint_t omp_block_end)
{
    {
        /* part of the block that lies before index: [omp_block_start, min(index, omp_block_end)) */
        fast_sint_t i = omp_block_start, j = (fast_sint_t)index; if (omp_block_end < j) { j = omp_block_end; }
        for (; i < j; ++i) { fast_uint_t c = T[i]; P[bucket2[c]++] = (sa_uint_t)i; }
    }

    {
        /* part of the block that lies at or after index; T is shifted by one so that T[i] reads the original T[i - 1] */
        fast_sint_t i = (fast_sint_t)index, j = omp_block_end; if (omp_block_start > i) { i = omp_block_start; }
        for (T -= 1, i += 1; i <= j; ++i) { fast_uint_t c = T[i]; P[bucket2[c]++] = (sa_uint_t)i; }
    }
}

/*
 * Single-threaded preparation for the inverse BWT: fills bucket2[], fastbits[]
 * and P[] from the transformed string T of length n and the primary index I[0].
 * When freq is NULL the symbol histogram is computed from T.
 */
static void libsais16x64_unbwt_init_single(const uint16_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, const sa_uint_t *
RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits)
{
    fast_uint_t index = I[0];

    /* smallest shift for which (n >> shift) fits into 1 << UNBWT_FASTBITS table slots */
    fast_uint_t shift = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; }

    if (freq != NULL)
    {
        /* caller supplied the histogram: copy it instead of scanning T */
        memcpy(bucket2, freq, ALPHABET_SIZE * sizeof(sa_uint_t));
    }
    else
    {
        memset(bucket2, 0, ALPHABET_SIZE * sizeof(sa_uint_t));
        libsais16x64_unbwt_compute_histogram(T, n, bucket2);
    }

    /* counts -> bucket starts + lookup table, then scatter positions (which advances the starts to bucket ends) */
    libsais16x64_unbwt_calculate_fastbits(bucket2, fastbits, shift);
    libsais16x64_unbwt_calculate_P(T, P, bucket2, index, 0, n);
}

#if defined(LIBSAIS_OPENMP)

/*
 * Multi-threaded counterpart of libsais16x64_unbwt_init_single().
 *
 * `buckets` provides one ALPHABET_SIZE-sized histogram per thread. If the
 * parallel region ends up with a single thread, the work is handed to the
 * single-threaded routine. Note that the multi-threaded branch always counts
 * symbols from T; freq is only passed on in the single-thread fallback.
 */
static void libsais16x64_unbwt_init_parallel(const uint16_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_uint_t * RESTRICT buckets, sa_sint_t threads)
{
    fast_uint_t index = I[0];

    /* smallest shift for which (n >> shift) fits into 1 << UNBWT_FASTBITS table slots */
    fast_uint_t shift = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; }

    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
    {
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();

        if (omp_num_threads == 1)
        {
            libsais16x64_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits);
        }
        else
        {
            /* Phase 1: each thread counts the symbols of its own slice of T into its private histogram. */
            {
                sa_uint_t * RESTRICT bucket2_local  = buckets + omp_thread_num * ALPHABET_SIZE;

                /* slices are a multiple of 16 elements long; the last thread takes whatever is left */
                fast_sint_t omp_block_stride        = (n / omp_num_threads) & (-16);
                fast_sint_t omp_block_start         = omp_thread_num * omp_block_stride;
                fast_sint_t omp_block_size          = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

                memset(bucket2_local, 0, ALPHABET_SIZE * sizeof(sa_uint_t));
                libsais16x64_unbwt_compute_histogram(T + omp_block_start, omp_block_size, bucket2_local);
            }

            #pragma omp barrier

            /* Phase 2: each thread merges the private histograms for its own slice of the alphabet. */
            {
                sa_uint_t * RESTRICT bucket2_temp   = buckets;
                fast_sint_t omp_block_stride        = (ALPHABET_SIZE / omp_num_threads) & (-16);
                fast_sint_t omp_block_start         = omp_thread_num * omp_block_stride;
                fast_sint_t omp_block_size          = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : ALPHABET_SIZE - omp_block_start;

                memset(bucket2 + omp_block_start, 0, (size_t)omp_block_size * sizeof(sa_uint_t));

                /* After this loop bucket2[c] is the total count of c and each thread's entry holds
                   the count of c contributed by the threads before it (an exclusive prefix). */
                fast_sint_t t;
                for (t = 0; t < omp_num_threads; ++t, bucket2_temp += ALPHABET_SIZE)
                {
                    fast_sint_t c;
                    for (c = omp_block_start; c < omp_block_start + omp_block_size; c += 1)
                    {
                        sa_uint_t A = bucket2[c], B = bucket2_temp[c];
                        bucket2[c] = A + B; bucket2_temp[c] = A;
                    }
                }
            }

            #pragma omp barrier

            /* Phase 3: one thread converts the totals into bucket starts and builds the lookup table. */
            #pragma omp master
            {
                libsais16x64_unbwt_calculate_fastbits(bucket2, fastbits, shift);
            }

            #pragma omp barrier

            /* Phase 4: every thread scatters the positions of its slice of T, starting at
               (global bucket start + its own per-symbol prefix). */
            {
                sa_uint_t * RESTRICT bucket2_local  = buckets + omp_thread_num * ALPHABET_SIZE;
                fast_sint_t omp_block_stride        = (n / omp_num_threads) & (-16);
                fast_sint_t omp_block_start         = omp_thread_num * omp_block_stride;
                fast_sint_t omp_block_size          = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

                fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_uint_t A = bucket2[c], B = bucket2_local[c]; bucket2_local[c] = A + B; }

                libsais16x64_unbwt_calculate_P(T, P, bucket2_local, index, omp_block_start, omp_block_start + omp_block_size);
            }

            #pragma omp barrier

            /* Phase 5: the last thread's cursors now point at the bucket ends; publish them as bucket2. */
            #pragma omp master
            {
                memcpy(bucket2, buckets + (omp_num_threads - 1) * ALPHABET_SIZE, ALPHABET_SIZE * sizeof(sa_uint_t));
            }
        }
    }
}

#endif

/*
 * Decodes k symbols of a single stream into U.
 *
 * *i0 is the current position in P; for every output symbol the owning symbol
 * of that position is looked up (fastbits gives a starting guess that is then
 * advanced until bucket2[c] exceeds the position), the position is replaced by
 * P[position], and the symbol is stored. The final position is written back
 * to *i0 so that decoding can be resumed.
 */
static void libsais16x64_unbwt_decode_1(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t * i0, fast_uint_t k)
{
    uint16_t * RESTRICT U0 = U;

    fast_uint_t i, p0 = *i0;

    for (i = 0; i != k; ++i)
    {
        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
    }

    *i0 = p0;
}

/*
 * Decodes k symbols from each of two independent streams in lock step.
 * Stream j starts at position *ij and writes to U + j * r; the final
 * positions are written back through i0 and i1.
 */
static void libsais16x64_unbwt_decode_2(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t k)
{
    uint16_t * RESTRICT U0 = U;
    uint16_t * RESTRICT U1 = U0 + r;

    fast_uint_t i, p0 = *i0, p1 =
*i1;

    /* one symbol per stream per iteration; the two chains are independent of each other */
    for (i = 0; i != k; ++i)
    {
        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
    }

    *i0 = p0; *i1 = p1;
}

/*
 * Decodes k symbols from each of three independent streams in lock step.
 * Stream j starts at position *ij and writes to U + j * r; the final
 * positions are written back through i0..i2.
 */
static void libsais16x64_unbwt_decode_3(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t k)
{
    uint16_t * RESTRICT U0 = U;
    uint16_t * RESTRICT U1 = U0 + r;
    uint16_t * RESTRICT U2 = U1 + r;

    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2;

    for (i = 0; i != k; ++i)
    {
        /* per stream: fastbits guess -> advance to the owning symbol -> follow P -> emit */
        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2;
    }

    *i0 = p0; *i1 = p1; *i2 = p2;
}

/*
 * Decodes k symbols from each of four independent streams in lock step.
 * Stream j starts at position *ij and writes to U + j * r; the final
 * positions are written back through i0..i3.
 */
static void libsais16x64_unbwt_decode_4(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t k)
{
    uint16_t * RESTRICT U0 = U;
    uint16_t * RESTRICT U1 = U0 + r;
    uint16_t * RESTRICT U2 = U1 + r;
    uint16_t * RESTRICT U3 = U2 + r;

    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3;

    for (i = 0; i != k; ++i)
    {
        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2];
U2[i] = c2;
        uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3;
    }

    /* hand the stream positions back so the caller can continue decoding from here */
    *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3;
}

/*
 * Decodes k symbols from each of five independent streams in lock step.
 * Stream j starts at position *ij and writes to U + j * r; the final
 * positions are written back through i0..i4.
 */
static void libsais16x64_unbwt_decode_5(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t k)
{
    uint16_t * RESTRICT U0 = U;
    uint16_t * RESTRICT U1 = U0 + r;
    uint16_t * RESTRICT U2 = U1 + r;
    uint16_t * RESTRICT U3 = U2 + r;
    uint16_t * RESTRICT U4 = U3 + r;

    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4;

    for (i = 0; i != k; ++i)
    {
        /* per stream: fastbits guess -> advance to the owning symbol -> follow P -> emit */
        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2;
        uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3;
        uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = c4;
    }

    *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4;
}

/*
 * Decodes k symbols from each of six independent streams in lock step.
 * Stream j starts at position *ij and writes to U + j * r; the final
 * positions are written back through i0..i5.
 */
static void libsais16x64_unbwt_decode_6(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t k)
{
    uint16_t * RESTRICT U0 = U;
    uint16_t * RESTRICT U1 = U0 + r;
    uint16_t * RESTRICT U2 = U1 + r;
    uint16_t * RESTRICT U3 = U2 + r;
    uint16_t * RESTRICT U4 = U3 + r;
    uint16_t * RESTRICT U5 = U4 + r;

    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5;

    for (i
= 0; i != k; ++i)
    {
        /* per stream: fastbits guess -> advance to the owning symbol -> follow P -> emit */
        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2;
        uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3;
        uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = c4;
        uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = c5;
    }

    /* hand the stream positions back so the caller can continue decoding from here */
    *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5;
}

/*
 * Decodes k symbols from each of seven independent streams in lock step.
 * Stream j starts at position *ij and writes to U + j * r; the final
 * positions are written back through i0..i6.
 */
static void libsais16x64_unbwt_decode_7(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6, fast_uint_t k)
{
    uint16_t * RESTRICT U0 = U;
    uint16_t * RESTRICT U1 = U0 + r;
    uint16_t * RESTRICT U2 = U1 + r;
    uint16_t * RESTRICT U3 = U2 + r;
    uint16_t * RESTRICT U4 = U3 + r;
    uint16_t * RESTRICT U5 = U4 + r;
    uint16_t * RESTRICT U6 = U5 + r;

    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6;

    for (i = 0; i != k; ++i)
    {
        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2;
        uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3);
} p3 = P[p3]; U3[i] = c3;
        uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = c4;
        uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = c5;
        uint16_t c6 = fastbits[p6 >> shift]; if (bucket2[c6] <= p6) { do { c6++; } while (bucket2[c6] <= p6); } p6 = P[p6]; U6[i] = c6;
    }

    /* hand the stream positions back so the caller can continue decoding from here */
    *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; *i6 = p6;
}

/*
 * Decodes k symbols from each of eight independent streams in lock step.
 * Stream j starts at position *ij and writes to U + j * r; the final
 * positions are written back through i0..i7.
 */
static void libsais16x64_unbwt_decode_8(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6, fast_uint_t * i7, fast_uint_t k)
{
    uint16_t * RESTRICT U0 = U;
    uint16_t * RESTRICT U1 = U0 + r;
    uint16_t * RESTRICT U2 = U1 + r;
    uint16_t * RESTRICT U3 = U2 + r;
    uint16_t * RESTRICT U4 = U3 + r;
    uint16_t * RESTRICT U5 = U4 + r;
    uint16_t * RESTRICT U6 = U5 + r;
    uint16_t * RESTRICT U7 = U6 + r;

    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6, p7 = *i7;

    for (i = 0; i != k; ++i)
    {
        /* per stream: fastbits guess -> advance to the owning symbol -> follow P -> emit */
        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = c0;
        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = c1;
        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = c2;
        uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = c3;
        uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = c4;
        uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = c5;
        uint16_t c6 =
fastbits[p6 >> shift]; if (bucket2[c6] <= p6) { do { c6++; } while (bucket2[c6] <= p6); } p6 = P[p6]; U6[i] = c6;
        uint16_t c7 = fastbits[p7 >> shift]; if (bucket2[c7] <= p7) { do { c7++; } while (bucket2[c7] <= p7); } p7 = P[p7]; U7[i] = c7;
    }

    /* hand the stream positions back so the caller can continue decoding from here */
    *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; *i6 = p6; *i7 = p7;
}

/*
 * Decodes `blocks` consecutive blocks of the output, each r symbols long
 * except the last one, which is `remainder` symbols long. I[j] is the start
 * position for block j and U points at the first of these blocks.
 *
 * Blocks are processed eight at a time while more than eight remain. The
 * final group of 1..8 blocks is decoded in two steps: first `remainder`
 * symbols from every stream, then the outstanding r - remainder symbols from
 * all streams except the (shorter) last one.
 */
static void libsais16x64_unbwt_decode(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_sint_t blocks, fast_uint_t remainder)
{
    /* same shift as used when fastbits was built: smallest value with (n >> shift) <= 1 << UNBWT_FASTBITS */
    fast_uint_t shift   = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; }
    fast_uint_t offset  = 0;

    /* "> 8" (not ">= 8") leaves 1..8 blocks for the tail handling below */
    while (blocks > 8)
    {
        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7];
        libsais16x64_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, &i7, (fast_uint_t)r);
        I += 8; blocks -= 8; offset += 8 * (fast_uint_t)r;
    }

    if (blocks == 1)
    {
        fast_uint_t i0 = I[0];
        libsais16x64_unbwt_decode_1(U + offset, P, bucket2, fastbits, shift, &i0, remainder);
    }
    else if (blocks == 2)
    {
        fast_uint_t i0 = I[0], i1 = I[1];
        libsais16x64_unbwt_decode_2(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, remainder);
        libsais16x64_unbwt_decode_1(U + offset + remainder, P, bucket2, fastbits, shift, &i0, ((fast_uint_t)r) - remainder);
    }
    else if (blocks == 3)
    {
        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2];
        libsais16x64_unbwt_decode_3(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, remainder);
        libsais16x64_unbwt_decode_2(U + offset + remainder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, ((fast_uint_t)r) - remainder);
    }
    else if (blocks == 4)
    {
        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3];
        libsais16x64_unbwt_decode_4(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, remainder);
        libsais16x64_unbwt_decode_3(U + offset + remainder, P, bucket2,
fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, ((fast_uint_t)r) - remainder);
    }
    else if (blocks == 5)
    {
        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4];
        libsais16x64_unbwt_decode_5(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, remainder);
        libsais16x64_unbwt_decode_4(U + offset + remainder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, ((fast_uint_t)r) - remainder);
    }
    else if (blocks == 6)
    {
        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5];
        libsais16x64_unbwt_decode_6(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, remainder);
        libsais16x64_unbwt_decode_5(U + offset + remainder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, ((fast_uint_t)r) - remainder);
    }
    else if (blocks == 7)
    {
        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6];
        libsais16x64_unbwt_decode_7(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, remainder);
        libsais16x64_unbwt_decode_6(U + offset + remainder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, ((fast_uint_t)r) - remainder);
    }
    else
    {
        /* exactly eight blocks remain here (the loop above only stops once blocks <= 8) */
        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7];
        libsais16x64_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, &i7, remainder);
        libsais16x64_unbwt_decode_7(U + offset + remainder, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, ((fast_uint_t)r) - remainder);
    }
}

/*
 * Splits the output into blocks of r symbols and distributes contiguous runs
 * of blocks over the available threads, each run being decoded by
 * libsais16x64_unbwt_decode(). Without OpenMP a single thread handles
 * all blocks.
 */
static void libsais16x64_unbwt_decode_omp(uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_sint_t threads)
{
    /* number of blocks = ceil(n / r); remainder = length of the final (possibly short) block */
    fast_sint_t blocks      = 1 + (((fast_sint_t)n - 1) / (fast_sint_t)r);
    fast_uint_t remainder   = (fast_uint_t)n - ((fast_uint_t)r *
((fast_uint_t)blocks - 1));

#if defined(LIBSAIS_OPENMP)
    /* never start more threads than there are blocks to decode */
    fast_sint_t max_threads = blocks < threads ? blocks : threads;
    #pragma omp parallel num_threads(max_threads) if(max_threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num      = omp_get_thread_num();
        fast_sint_t omp_num_threads     = omp_get_num_threads();
#else
        UNUSED(threads);

        fast_sint_t omp_thread_num      = 0;
        fast_sint_t omp_num_threads     = 1;
#endif

        /* even split of the blocks; the first (blocks % threads) threads take one extra block */
        fast_sint_t omp_block_stride    = blocks / omp_num_threads;
        fast_sint_t omp_block_remainder = blocks % omp_num_threads;
        fast_sint_t omp_block_size      = omp_block_stride + (omp_thread_num < omp_block_remainder);
        fast_sint_t omp_block_start     = omp_block_stride * omp_thread_num + (omp_thread_num < omp_block_remainder ? omp_thread_num : omp_block_remainder);

        /* only the last thread owns the short final block; for all others the last block is a full r symbols */
        libsais16x64_unbwt_decode(U + r * omp_block_start, P, n, r, I + omp_block_start, bucket2, fastbits, omp_block_size, omp_thread_num < omp_num_threads - 1 ? (fast_uint_t)r : remainder);
    }
}

/*
 * Runs both stages of the inverse BWT on preallocated work buffers: table
 * initialisation (parallel when built with OpenMP, threads > 1 and
 * n >= 262144; single-threaded otherwise) followed by decoding. Always
 * returns 0.
 */
static sa_sint_t libsais16x64_unbwt_core(const uint16_t * RESTRICT T, uint16_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_uint_t * RESTRICT buckets, sa_sint_t threads)
{
    /* With OpenMP the braces below form the else-branch of the if; without it they are a plain block. */
#if defined(LIBSAIS_OPENMP)
    if (threads > 1 && n >= 262144)
    {
        libsais16x64_unbwt_init_parallel(T, P, n, freq, I, bucket2, fastbits, buckets, threads);
    }
    else
#else
    UNUSED(buckets);
#endif
    {
        libsais16x64_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits);
    }

    libsais16x64_unbwt_decode_omp(U, P, n, r, I, bucket2, fastbits, threads);
    return 0;
}

/*
 * Allocates the work buffers for the inverse BWT, runs
 * libsais16x64_unbwt_core() and releases the buffers again. Returns the
 * core's result, or -2 when one of the required allocations failed.
 */
static sa_sint_t libsais16x64_unbwt_main(const uint16_t * T, uint16_t * U, sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I, sa_sint_t threads)
{
    /* the shift determines how many fastbits entries are needed: 1 + (n >> shift) */
    fast_uint_t shift = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; }

    sa_uint_t * RESTRICT bucket2    = (sa_uint_t *)libsais16x64_alloc_aligned(ALPHABET_SIZE *
sizeof(sa_uint_t), 4096); uint16_t * RESTRICT fastbits = (uint16_t *)libsais16x64_alloc_aligned(((size_t)1 + (size_t)(n >> shift)) * sizeof(uint16_t), 4096); sa_uint_t * RESTRICT buckets = threads > 1 && n >= 262144 ? (sa_uint_t *)libsais16x64_alloc_aligned((size_t)threads * ALPHABET_SIZE * sizeof(sa_uint_t), 4096) : NULL; sa_sint_t index = bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1 || n < 262144) ? libsais16x64_unbwt_core(T, U, P, n, freq, r, I, bucket2, fastbits, buckets, threads) : -2; libsais16x64_free_aligned(buckets); libsais16x64_free_aligned(fastbits); libsais16x64_free_aligned(bucket2); return index; } int64_t libsais16x64_unbwt(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t i) { return libsais16x64_unbwt_aux(T, U, A, n, freq, n, &i); } int64_t libsais16x64_unbwt_aux(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t r, const int64_t * I) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL)) { return -1; } else if (n <= 1) { if (I[0] != n) { return -1; } if (n == 1) { U[0] = T[0]; } return 0; } fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } } if (n <= INT32_MAX && r <= INT32_MAX && (n - 1) / r < 1024) { int32_t indexes[1024]; for (t = 0; t <= (n - 1) / r; ++t) { indexes[t] = (int32_t)I[t]; } return libsais16_unbwt_aux(T, U, (int32_t *)A, (int32_t)n, NULL, (int32_t)r, indexes); } return libsais16x64_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, 1); } #if defined(LIBSAIS_OPENMP) int64_t libsais16x64_unbwt_omp(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t i, int64_t threads) { return libsais16x64_unbwt_aux_omp(T, U, A, n, freq, n, &i, threads); } int64_t libsais16x64_unbwt_aux_omp(const uint16_t * T, uint16_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t r, const 
int64_t * I, int64_t threads)
{
    /* r must either span the whole input or be a power of two >= 2 */
    if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL) || (threads < 0))
    {
        return -1;
    }
    else if (n <= 1)
    {
        if (I[0] != n) { return -1; }
        if (n == 1) { U[0] = T[0]; }
        return 0;
    }

    /* every auxiliary index has to lie in 1..n */
    fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } }

    if (n <= INT32_MAX && r <= INT32_MAX && (n - 1) / r < 1024)
    {
        /* the 32-bit fast path is limited to 1024 indexes by this stack buffer; note that it passes NULL instead of freq */
        int32_t indexes[1024]; for (t = 0; t <= (n - 1) / r; ++t) { indexes[t] = (int32_t)I[t]; }

        return libsais16_unbwt_aux_omp(T, U, (int32_t *)A, (int32_t)n, NULL,(int32_t)r, indexes, (int32_t)threads);
    }

    /* threads == 0: take max threads divided by the size of the current team; the integer division may give 0, hence the clamp to 1 */
    threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads());
    threads = threads > 0 ? threads : 1;

    return libsais16x64_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, threads);
}

#endif

/*
 * For every i in [omp_block_start, omp_block_start + omp_block_size) stores
 * the suffix that precedes SA[i] in the suffix array into PLCP[SA[i]]:
 * PLCP[SA[i]] = SA[i - 1], with n used as the predecessor of SA[0].
 */
static void libsais16x64_compute_phi(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP, sa_sint_t n, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    /* k carries the previous suffix array entry; a block that does not start at 0 picks it up from the preceding slot */
    fast_sint_t i, j; sa_sint_t k = omp_block_start > 0 ?
SA[omp_block_start - 1] : n;

    /* main loop, unrolled four times; it stops early enough that SA[i + prefetch_distance + 3] stays inside the block */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
    {
        libsais16x64_prefetchr(&SA[i + 2 * prefetch_distance]);

        libsais16x64_prefetchw(&PLCP[SA[i + prefetch_distance + 0]]);
        libsais16x64_prefetchw(&PLCP[SA[i + prefetch_distance + 1]]);

        PLCP[SA[i + 0]] = k; k = SA[i + 0];
        PLCP[SA[i + 1]] = k; k = SA[i + 1];

        libsais16x64_prefetchw(&PLCP[SA[i + prefetch_distance + 2]]);
        libsais16x64_prefetchw(&PLCP[SA[i + prefetch_distance + 3]]);

        PLCP[SA[i + 2]] = k; k = SA[i + 2];
        PLCP[SA[i + 3]] = k; k = SA[i + 3];
    }

    /* remaining entries of the block, without look-ahead */
    for (j += prefetch_distance + 3; i < j; i += 1)
    {
        PLCP[SA[i]] = k; k = SA[i];
    }
}

/*
 * Runs libsais16x64_compute_phi() over SA[0 .. n - 1], split into one
 * contiguous block per thread when OpenMP is enabled.
 */
static void libsais16x64_compute_phi_omp(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP, sa_sint_t n, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(threads);

        fast_sint_t omp_thread_num    = 0;
        fast_sint_t omp_num_threads   = 1;
#endif
        /* blocks are a multiple of 16 elements long; the last thread takes whatever is left */
        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16);
        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

        libsais16x64_compute_phi(SA, PLCP, n, omp_block_start, omp_block_size);
    }
}

/*
 * Replaces, for every text position i of the block, the predecessor suffix
 * stored in PLCP[i] by the length of the common prefix of suffix i and that
 * predecessor. The match length l is carried over from position i to i + 1
 * (minus one), so characters already known to match are not compared again.
 */
static void libsais16x64_compute_plcp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, fast_sint_t n, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j, l = 0;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance; i < j; i += 1)
    {
        libsais16x64_prefetchw(&PLCP[i + 2 * prefetch_distance]);
        libsais16x64_prefetchr(&T[PLCP[i + prefetch_distance] + l]);

        /* m bounds the comparison so that neither suffix is read past the end of T */
        fast_sint_t k = PLCP[i], m = n - (i > k ?
i : k);
        while (l < m && T[i + l] == T[k + l]) { l++; }

        /* the next suffix shares at least l - 1 characters with its own predecessor */
        PLCP[i] = (sa_sint_t)l; l -= (l != 0);
    }

    /* last prefetch_distance positions of the block, without look-ahead */
    for (j += prefetch_distance; i < j; i += 1)
    {
        fast_sint_t k = PLCP[i], m = n - (i > k ? i : k);
        while (l < m && T[i + l] == T[k + l]) { l++; }

        PLCP[i] = (sa_sint_t)l; l -= (l != 0);
    }
}

/*
 * Runs libsais16x64_compute_plcp() over all n positions, split into one
 * contiguous block per thread when OpenMP is enabled. Every block starts
 * with a match length of 0.
 */
static void libsais16x64_compute_plcp_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, sa_sint_t n, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(threads);

        fast_sint_t omp_thread_num    = 0;
        fast_sint_t omp_num_threads   = 1;
#endif
        /* blocks are a multiple of 16 elements long; the last thread takes whatever is left */
        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16);
        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

        libsais16x64_compute_plcp(T, PLCP, n, omp_block_start, omp_block_size);
    }
}

/*
 * Generalized-suffix-array variant of libsais16x64_compute_plcp(): instead
 * of bounding the comparison by the text length, matching stops as soon as
 * suffix i reaches a 0 symbol, so a common prefix never extends across one.
 */
static void libsais16x64_compute_plcp_gsa(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j, l = 0;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance; i < j; i += 1)
    {
        libsais16x64_prefetchw(&PLCP[i + 2 * prefetch_distance]);
        libsais16x64_prefetchr(&T[PLCP[i + prefetch_distance] + l]);

        fast_sint_t k = PLCP[i];
        while (T[i + l] > 0 && T[i + l] == T[k + l]) { l++; }

        /* the next suffix shares at least l - 1 characters with its own predecessor */
        PLCP[i] = (sa_sint_t)l; l -= (l != 0);
    }

    /* last prefetch_distance positions of the block, without look-ahead */
    for (j += prefetch_distance; i < j; i += 1)
    {
        fast_sint_t k = PLCP[i];
        while (T[i + l] > 0 && T[i + l] == T[k + l]) { l++; }

        PLCP[i] = (sa_sint_t)l; l -= (l != 0);
    }
}

/*
 * Runs libsais16x64_compute_plcp_gsa() over all n positions, split into one
 * contiguous block per thread when OpenMP is enabled.
 */
static void libsais16x64_compute_plcp_gsa_omp(const uint16_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, sa_sint_t n, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(threads);

        fast_sint_t omp_thread_num    = 0;
        fast_sint_t omp_num_threads   = 1;
#endif
        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16);
        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

        libsais16x64_compute_plcp_gsa(T, PLCP, omp_block_start, omp_block_size);
    }
}

/*
 * Permutes PLCP into suffix array order for one block:
 * LCP[i] = PLCP[SA[i]] for i in [omp_block_start, omp_block_start + omp_block_size).
 */
static void libsais16x64_compute_lcp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT LCP, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    /* main loop, unrolled four times; it stops early enough that SA[i + prefetch_distance + 3] stays inside the block */
    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
    {
        libsais16x64_prefetchr(&SA[i + 2 * prefetch_distance]);
        libsais16x64_prefetchw(&LCP[i + prefetch_distance]);

        libsais16x64_prefetchr(&PLCP[SA[i + prefetch_distance + 0]]);
        libsais16x64_prefetchr(&PLCP[SA[i + prefetch_distance + 1]]);

        LCP[i + 0] = PLCP[SA[i + 0]];
        LCP[i + 1] = PLCP[SA[i + 1]];

        libsais16x64_prefetchr(&PLCP[SA[i + prefetch_distance + 2]]);
        libsais16x64_prefetchr(&PLCP[SA[i + prefetch_distance + 3]]);

        LCP[i + 2] = PLCP[SA[i + 2]];
        LCP[i + 3] = PLCP[SA[i + 3]];
    }

    /* remaining entries of the block, without look-ahead */
    for (j += prefetch_distance + 3; i < j; i += 1)
    {
        LCP[i] = PLCP[SA[i]];
    }
}

/*
 * Runs libsais16x64_compute_lcp() over all n entries, split into one
 * contiguous block per thread when OpenMP is enabled.
 */
static void libsais16x64_compute_lcp_omp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT LCP, sa_sint_t n, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(threads);

        fast_sint_t omp_thread_num    = 0;
        fast_sint_t omp_num_threads   = 1;
#endif
        fast_sint_t
omp_block_stride = (n / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; libsais16x64_compute_lcp(PLCP, SA, LCP, omp_block_start, omp_block_size); } } int64_t libsais16x64_plcp(const uint16_t * T, const int64_t * SA, int64_t * PLCP, int64_t n) { if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0)) { return -1; } else if (n <= 1) { if (n == 1) { PLCP[0] = 0; } return 0; } libsais16x64_compute_phi_omp(SA, PLCP, n, 1); libsais16x64_compute_plcp_omp(T, PLCP, n, 1); return 0; } int64_t libsais16x64_plcp_gsa(const uint16_t * T, const int64_t * SA, int64_t * PLCP, int64_t n) { if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0) || (n > 0 && T[n - 1] != 0)) { return -1; } else if (n <= 1) { if (n == 1) { PLCP[0] = 0; } return 0; } libsais16x64_compute_phi_omp(SA, PLCP, n, 1); libsais16x64_compute_plcp_gsa_omp(T, PLCP, n, 1); return 0; } int64_t libsais16x64_lcp(const int64_t * PLCP, const int64_t * SA, int64_t * LCP, int64_t n) { if ((PLCP == NULL) || (SA == NULL) || (LCP == NULL) || (n < 0)) { return -1; } else if (n <= 1) { if (n == 1) { LCP[0] = PLCP[SA[0]]; } return 0; } libsais16x64_compute_lcp_omp(PLCP, SA, LCP, n, 1); return 0; } #if defined(LIBSAIS_OPENMP) int64_t libsais16x64_plcp_omp(const uint16_t * T, const int64_t * SA, int64_t * PLCP, int64_t n, int64_t threads) { if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0) || (threads < 0)) { return -1; } else if (n <= 1) { if (n == 1) { PLCP[0] = 0; } return 0; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? 
threads : 1; libsais16x64_compute_phi_omp(SA, PLCP, n, threads); libsais16x64_compute_plcp_omp(T, PLCP, n, threads); return 0; } int64_t libsais16x64_plcp_gsa_omp(const uint16_t * T, const int64_t * SA, int64_t * PLCP, int64_t n, int64_t threads) { if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0) || (n > 0 && T[n - 1] != 0) || (threads < 0)) { return -1; } else if (n <= 1) { if (n == 1) { PLCP[0] = 0; } return 0; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? threads : 1; libsais16x64_compute_phi_omp(SA, PLCP, n, threads); libsais16x64_compute_plcp_gsa_omp(T, PLCP, n, threads); return 0; } int64_t libsais16x64_lcp_omp(const int64_t * PLCP, const int64_t * SA, int64_t * LCP, int64_t n, int64_t threads) { if ((PLCP == NULL) || (SA == NULL) || (LCP == NULL) || (n < 0) || (threads < 0)) { return -1; } else if (n <= 1) { if (n == 1) { LCP[0] = PLCP[SA[0]]; } return 0; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? threads : 1; libsais16x64_compute_lcp_omp(PLCP, SA, LCP, n, threads); return 0; } #endif ================================================ FILE: src/libsais64.c ================================================ /*-- This file is a part of libsais, a library for linear time suffix array, longest common prefix array and burrows wheeler transform construction. Copyright (c) 2021-2025 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
Please see the file LICENSE for full copyright information. --*/ #include "libsais.h" #include "libsais64.h" #include #include #include #include #include #if defined(LIBSAIS_OPENMP) #include #else #define UNUSED(_x) (void)(_x) #endif typedef int64_t sa_sint_t; typedef uint64_t sa_uint_t; typedef int64_t fast_sint_t; typedef uint64_t fast_uint_t; #define SAINT_BIT (64) #define SAINT_MAX INT64_MAX #define SAINT_MIN INT64_MIN #define ALPHABET_SIZE (1 << CHAR_BIT) #define UNBWT_FASTBITS (17) #define SUFFIX_GROUP_BIT (SAINT_BIT - 1) #define SUFFIX_GROUP_MARKER (((sa_sint_t)1) << (SUFFIX_GROUP_BIT - 1)) #define BUCKETS_INDEX2(_c, _s) ((((fast_sint_t)_c) << 1) + (fast_sint_t)(_s)) #define BUCKETS_INDEX4(_c, _s) ((((fast_sint_t)_c) << 2) + (fast_sint_t)(_s)) #define LIBSAIS_LOCAL_BUFFER_SIZE (1000) #define LIBSAIS_PER_THREAD_CACHE_SIZE (24576) #define LIBSAIS_FLAGS_NONE (0) #define LIBSAIS_FLAGS_BWT (1) #define LIBSAIS_FLAGS_GSA (2) typedef struct LIBSAIS_THREAD_CACHE { sa_sint_t symbol; sa_sint_t index; } LIBSAIS_THREAD_CACHE; typedef union LIBSAIS_THREAD_STATE { struct { fast_sint_t position; fast_sint_t count; fast_sint_t m; fast_sint_t last_lms_suffix; sa_sint_t * buckets; LIBSAIS_THREAD_CACHE * cache; } state; uint8_t padding[64]; } LIBSAIS_THREAD_STATE; typedef struct LIBSAIS_CONTEXT { sa_sint_t * buckets; LIBSAIS_THREAD_STATE * thread_state; fast_sint_t threads; } LIBSAIS_CONTEXT; typedef struct LIBSAIS_UNBWT_CONTEXT { sa_uint_t * bucket2; uint16_t * fastbits; sa_uint_t * buckets; fast_sint_t threads; } LIBSAIS_UNBWT_CONTEXT; #if defined(__GNUC__) || defined(__clang__) #define RESTRICT __restrict__ #elif defined(_MSC_VER) || defined(__INTEL_COMPILER) #define RESTRICT __restrict #else #error Your compiler, configuration or platform is not supported. 
#endif #if defined(__has_builtin) #if __has_builtin(__builtin_prefetch) #define HAS_BUILTIN_PREFETCH #endif #elif defined(__GNUC__) && (((__GNUC__ == 3) && (__GNUC_MINOR__ >= 2)) || (__GNUC__ >= 4)) #define HAS_BUILTIN_PREFETCH #endif #if defined(__has_builtin) #if __has_builtin(__builtin_bswap16) #define HAS_BUILTIN_BSWAP16 #endif #elif defined(__GNUC__) && (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 8)) || (__GNUC__ >= 5)) #define HAS_BUILTIN_BSWAP16 #endif #if defined(HAS_BUILTIN_PREFETCH) #define libsais64_prefetchr(address) __builtin_prefetch((const void *)(address), 0, 3) #define libsais64_prefetchw(address) __builtin_prefetch((const void *)(address), 1, 3) #elif defined (_M_IX86) || defined (_M_AMD64) #include #define libsais64_prefetchr(address) _mm_prefetch((const void *)(address), _MM_HINT_T0) #define libsais64_prefetchw(address) _m_prefetchw((const void *)(address)) #elif defined (_M_ARM) #include #define libsais64_prefetchr(address) __prefetch((const void *)(address)) #define libsais64_prefetchw(address) __prefetchw((const void *)(address)) #elif defined (_M_ARM64) #include #define libsais64_prefetchr(address) __prefetch2((const void *)(address), 0) #define libsais64_prefetchw(address) __prefetch2((const void *)(address), 16) #else #error Your compiler, configuration or platform is not supported. 
#endif #if !defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) #if defined(_LITTLE_ENDIAN) \ || (defined(BYTE_ORDER) && defined(LITTLE_ENDIAN) && BYTE_ORDER == LITTLE_ENDIAN) \ || (defined(_BYTE_ORDER) && defined(_LITTLE_ENDIAN) && _BYTE_ORDER == _LITTLE_ENDIAN) \ || (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) \ || (defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) #define __LITTLE_ENDIAN__ #elif defined(_BIG_ENDIAN) \ || (defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN) \ || (defined(_BYTE_ORDER) && defined(_BIG_ENDIAN) && _BYTE_ORDER == _BIG_ENDIAN) \ || (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN) \ || (defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) #define __BIG_ENDIAN__ #elif defined(_WIN32) #define __LITTLE_ENDIAN__ #endif #endif #if defined(__LITTLE_ENDIAN__) && !defined(__BIG_ENDIAN__) #if defined(HAS_BUILTIN_BSWAP16) #define libsais64_bswap16(x) (__builtin_bswap16(x)) #elif defined(_MSC_VER) && !defined(__INTEL_COMPILER) #define libsais64_bswap16(x) (_byteswap_ushort(x)) #else #define libsais64_bswap16(x) ((uint16_t)(x >> 8) | (uint16_t)(x << 8)) #endif #elif !defined(__LITTLE_ENDIAN__) && defined(__BIG_ENDIAN__) #define libsais64_bswap16(x) (x) #else #error Your compiler, configuration or platform is not supported. 
#endif static void * libsais64_align_up(const void * address, size_t alignment) { return (void *)((((ptrdiff_t)address) + ((ptrdiff_t)alignment) - 1) & (-((ptrdiff_t)alignment))); } static void * libsais64_alloc_aligned(size_t size, size_t alignment) { void * address = malloc(size + sizeof(short) + alignment - 1); if (address != NULL) { void * aligned_address = libsais64_align_up((void *)((ptrdiff_t)address + (ptrdiff_t)(sizeof(short))), alignment); ((short *)aligned_address)[-1] = (short)((ptrdiff_t)aligned_address - (ptrdiff_t)address); return aligned_address; } return NULL; } static void libsais64_free_aligned(void * aligned_address) { if (aligned_address != NULL) { free((void *)((ptrdiff_t)aligned_address - ((short *)aligned_address)[-1])); } } static LIBSAIS_THREAD_STATE * libsais64_alloc_thread_state(sa_sint_t threads) { LIBSAIS_THREAD_STATE * RESTRICT thread_state = (LIBSAIS_THREAD_STATE *)libsais64_alloc_aligned((size_t)threads * sizeof(LIBSAIS_THREAD_STATE), 4096); sa_sint_t * RESTRICT thread_buckets = (sa_sint_t *)libsais64_alloc_aligned((size_t)threads * 4 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); LIBSAIS_THREAD_CACHE * RESTRICT thread_cache = (LIBSAIS_THREAD_CACHE *)libsais64_alloc_aligned((size_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE * sizeof(LIBSAIS_THREAD_CACHE), 4096); if (thread_state != NULL && thread_buckets != NULL && thread_cache != NULL) { fast_sint_t t; for (t = 0; t < threads; ++t) { thread_state[t].state.buckets = thread_buckets; thread_buckets += 4 * ALPHABET_SIZE; thread_state[t].state.cache = thread_cache; thread_cache += LIBSAIS_PER_THREAD_CACHE_SIZE; } return thread_state; } libsais64_free_aligned(thread_cache); libsais64_free_aligned(thread_buckets); libsais64_free_aligned(thread_state); return NULL; } static void libsais64_free_thread_state(LIBSAIS_THREAD_STATE * thread_state) { if (thread_state != NULL) { libsais64_free_aligned(thread_state[0].state.cache); libsais64_free_aligned(thread_state[0].state.buckets); 
libsais64_free_aligned(thread_state); } } #if defined(LIBSAIS_OPENMP) static sa_sint_t libsais64_count_negative_marked_suffixes(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { sa_sint_t count = 0; fast_sint_t i; for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) { count += (SA[i] < 0); } return count; } static sa_sint_t libsais64_count_zero_marked_suffixes(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { sa_sint_t count = 0; fast_sint_t i; for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) { count += (SA[i] == 0); } return count; } static void libsais64_place_cached_suffixes(sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) { libsais64_prefetchr(&cache[i + 2 * prefetch_distance]); libsais64_prefetchw(&SA[cache[i + prefetch_distance + 0].symbol]); libsais64_prefetchw(&SA[cache[i + prefetch_distance + 1].symbol]); libsais64_prefetchw(&SA[cache[i + prefetch_distance + 2].symbol]); libsais64_prefetchw(&SA[cache[i + prefetch_distance + 3].symbol]); SA[cache[i + 0].symbol] = cache[i + 0].index; SA[cache[i + 1].symbol] = cache[i + 1].index; SA[cache[i + 2].symbol] = cache[i + 2].index; SA[cache[i + 3].symbol] = cache[i + 3].index; } for (j += prefetch_distance + 3; i < j; i += 1) { SA[cache[i].symbol] = cache[i].index; } } static void libsais64_compact_and_place_cached_suffixes(sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j, l; for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4) { libsais64_prefetchw(&cache[i + prefetch_distance]); cache[l] = cache[i + 
0]; l += cache[l].symbol >= 0; cache[l] = cache[i + 1]; l += cache[l].symbol >= 0; cache[l] = cache[i + 2]; l += cache[l].symbol >= 0; cache[l] = cache[i + 3]; l += cache[l].symbol >= 0; } for (j += 3; i < j; i += 1) { cache[l] = cache[i]; l += cache[l].symbol >= 0; } libsais64_place_cached_suffixes(SA, cache, omp_block_start, l - omp_block_start); } static void libsais64_accumulate_counts_s32_2(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) { sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s]; } } static void libsais64_accumulate_counts_s32_3(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) { sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s]; } } static void libsais64_accumulate_counts_s32_4(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) { sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s]; } } static void libsais64_accumulate_counts_s32_5(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) { sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s]; } } static void libsais64_accumulate_counts_s32_6(sa_sint_t * 
RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) { sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride; fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s]; } } static void libsais64_accumulate_counts_s32_7(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) { sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride; sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride; fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s]; } } static void libsais64_accumulate_counts_s32_8(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t bucket_stride) { sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride; sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride; sa_sint_t * RESTRICT bucket07 = bucket06 - bucket_stride; fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s]; } } static void libsais64_accumulate_counts_s32_9(sa_sint_t * RESTRICT bucket00, fast_sint_t bucket_size, fast_sint_t 
bucket_stride) { sa_sint_t * RESTRICT bucket01 = bucket00 - bucket_stride; sa_sint_t * RESTRICT bucket02 = bucket01 - bucket_stride; sa_sint_t * RESTRICT bucket03 = bucket02 - bucket_stride; sa_sint_t * RESTRICT bucket04 = bucket03 - bucket_stride; sa_sint_t * RESTRICT bucket05 = bucket04 - bucket_stride; sa_sint_t * RESTRICT bucket06 = bucket05 - bucket_stride; sa_sint_t * RESTRICT bucket07 = bucket06 - bucket_stride; sa_sint_t * RESTRICT bucket08 = bucket07 - bucket_stride; fast_sint_t s; for (s = 0; s < bucket_size; s += 1) { bucket00[s] = bucket00[s] + bucket01[s] + bucket02[s] + bucket03[s] + bucket04[s] + bucket05[s] + bucket06[s] + bucket07[s] + bucket08[s]; } } static void libsais64_accumulate_counts_s32(sa_sint_t * RESTRICT buckets, fast_sint_t bucket_size, fast_sint_t bucket_stride, fast_sint_t num_buckets) { while (num_buckets >= 9) { libsais64_accumulate_counts_s32_9(buckets - (num_buckets - 9) * bucket_stride, bucket_size, bucket_stride); num_buckets -= 8; } switch (num_buckets) { case 1: break; case 2: libsais64_accumulate_counts_s32_2(buckets, bucket_size, bucket_stride); break; case 3: libsais64_accumulate_counts_s32_3(buckets, bucket_size, bucket_stride); break; case 4: libsais64_accumulate_counts_s32_4(buckets, bucket_size, bucket_stride); break; case 5: libsais64_accumulate_counts_s32_5(buckets, bucket_size, bucket_stride); break; case 6: libsais64_accumulate_counts_s32_6(buckets, bucket_size, bucket_stride); break; case 7: libsais64_accumulate_counts_s32_7(buckets, bucket_size, bucket_stride); break; case 8: libsais64_accumulate_counts_s32_8(buckets, bucket_size, bucket_stride); break; default: break; } } #endif static void libsais64_flip_suffix_markers_omp(sa_sint_t * RESTRICT SA, sa_sint_t l, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && l >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = 
omp_get_num_threads(); #else UNUSED(threads); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (l / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : l - omp_block_start; fast_sint_t i; for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i) { SA[i] ^= SAINT_MIN; } } } static void libsais64_gather_lms_suffixes_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, fast_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { if (omp_block_size > 0) { const fast_sint_t prefetch_distance = 256; fast_sint_t i, j = omp_block_start + omp_block_size, c0 = T[omp_block_start + omp_block_size - 1], c1 = -1; while (j < n && (c1 = T[j]) == c0) { ++j; } fast_uint_t f0 = c0 >= c1, f1 = 0; for (i = omp_block_start + omp_block_size - 2, j = omp_block_start + 3; i >= j; i -= 4) { libsais64_prefetchr(&T[i - prefetch_distance]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0); c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 0); m -= (f0 & ~f1); c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i - 1); m -= (f1 & ~f0); c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 2); m -= (f0 & ~f1); } for (j -= 3; i >= j; i -= 1) { c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i + 1); m -= (f0 & ~f1); } SA[m] = (sa_sint_t)(i + 1); } } static void libsais64_gather_lms_suffixes_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && omp_get_dynamic() == 0) #endif { 
#if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; if (omp_num_threads == 1) { libsais64_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t t, m = 0; for (t = omp_num_threads - 1; t > omp_thread_num; --t) { m += thread_state[t].state.m; } libsais64_gather_lms_suffixes_8u(T, SA, n, (fast_sint_t)n - 1 - m, omp_block_start, omp_block_size); #pragma omp barrier if (thread_state[omp_thread_num].state.m > 0) { SA[(fast_sint_t)n - 1 - m] = (sa_sint_t)thread_state[omp_thread_num].state.last_lms_suffix; } } #endif } } static sa_sint_t libsais64_gather_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n) { const fast_sint_t prefetch_distance = 64; sa_sint_t i = n - 2; sa_sint_t m = n - 1; fast_uint_t f0 = 1; fast_uint_t f1 = 0; fast_sint_t c0 = T[n - 1]; fast_sint_t c1 = 0; for (; i >= 3; i -= 4) { libsais64_prefetchr(&T[i - prefetch_distance]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = i + 1; m -= (sa_sint_t)(f1 & ~f0); c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = i - 0; m -= (sa_sint_t)(f0 & ~f1); c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = i - 1; m -= (sa_sint_t)(f1 & ~f0); c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = i - 2; m -= (sa_sint_t)(f0 & ~f1); } for (; i >= 0; i -= 1) { c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = i + 1; m -= (sa_sint_t)(f0 & ~f1); } return n - 1 
- m; } static sa_sint_t libsais64_gather_compacted_lms_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n) { const fast_sint_t prefetch_distance = 64; sa_sint_t i = n - 2; sa_sint_t m = n - 1; fast_uint_t f0 = 1; fast_uint_t f1 = 0; fast_sint_t c0 = T[n - 1]; fast_sint_t c1 = 0; for (; i >= 3; i -= 4) { libsais64_prefetchr(&T[i - prefetch_distance]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = i + 1; m -= (sa_sint_t)(f1 & ~f0 & (c0 >= 0)); c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = i - 0; m -= (sa_sint_t)(f0 & ~f1 & (c1 >= 0)); c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = i - 1; m -= (sa_sint_t)(f1 & ~f0 & (c0 >= 0)); c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = i - 2; m -= (sa_sint_t)(f0 & ~f1 & (c1 >= 0)); } for (; i >= 0; i -= 1) { c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = i + 1; m -= (sa_sint_t)(f0 & ~f1 & (c1 >= 0)); } return n - 1 - m; } #if defined(LIBSAIS_OPENMP) static void libsais64_count_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) { const fast_sint_t prefetch_distance = 64; memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t)); sa_sint_t i = n - 2; fast_uint_t f0 = 1; fast_uint_t f1 = 0; fast_sint_t c0 = T[n - 1]; fast_sint_t c1 = 0; for (; i >= prefetch_distance + 3; i -= 4) { libsais64_prefetchr(&T[i - 2 * prefetch_distance]); libsais64_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]); libsais64_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]); libsais64_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]); libsais64_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++; c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > 
(c1 - (fast_sint_t)(f1))); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++; c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++; c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++; } for (; i >= 0; i -= 1) { c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++; } buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0)]++; } #endif static void libsais64_count_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) { const fast_sint_t prefetch_distance = 64; memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); sa_sint_t i = n - 2; fast_uint_t f0 = 1; fast_uint_t f1 = 0; fast_sint_t c0 = T[n - 1]; fast_sint_t c1 = 0; for (; i >= prefetch_distance + 3; i -= 4) { libsais64_prefetchr(&T[i - 2 * prefetch_distance]); libsais64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]); libsais64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]); libsais64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]); libsais64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++; c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++; c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++; c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++; } for (; i >= 0; i -= 1) { c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++; } 
buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++; } #if defined(LIBSAIS_OPENMP) static void libsais64_count_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets) { const fast_sint_t prefetch_distance = 64; memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t)); sa_sint_t i = n - 2; fast_uint_t f0 = 1; fast_uint_t f1 = 0; fast_sint_t c0 = T[n - 1]; fast_sint_t c1 = 0; for (; i >= prefetch_distance + 3; i -= 4) { libsais64_prefetchr(&T[i - 2 * prefetch_distance]); libsais64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]); libsais64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]); libsais64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]); libsais64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++; c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++; c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++; c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++; } for (; i >= 0; i -= 1) { c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); c1 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++; } c0 &= SAINT_MAX; buckets[BUCKETS_INDEX2((fast_uint_t)c0, 0)]++; } #endif static sa_sint_t libsais64_count_and_gather_lms_suffixes_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { memset(buckets, 0, (size_t)4 * ALPHABET_SIZE * sizeof(sa_sint_t)); fast_sint_t m 
= omp_block_start + omp_block_size - 1; if (omp_block_size > 0) { const fast_sint_t prefetch_distance = 256; fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1; while (j < n && (c1 = T[j]) == c0) { ++j; } fast_uint_t f0 = c0 >= c1, f1 = 0; for (i = m - 1, j = omp_block_start + 3; i >= j; i -= 4) { libsais64_prefetchr(&T[i - prefetch_distance]); c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0); buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++; c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 0); m -= (f0 & ~f1); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++; c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i - 1); m -= (f1 & ~f0); buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++; c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 2); m -= (f0 & ~f1); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++; } for (j -= 3; i >= j; i -= 1) { c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i + 1); m -= (f0 & ~f1); buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++; } c1 = (i >= 0) ? 
/* Continuation of libsais64_count_and_gather_lms_suffixes_8u (its beginning lies before this span):
   the final step reads T[i], or uses -1 when i < 0. */
        T[i] : -1; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0);
        buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++;
    }

    /* m was lowered once per kept entry, so this difference is the number of entries kept. */
    return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
}

/*
 * Driver around libsais64_count_and_gather_lms_suffixes_8u for 8-bit text.
 *
 * One thread: the whole range [0, n) is processed directly into `buckets`.
 * Several threads: every thread processes its own slice of T into
 * thread_state[t].state.buckets and records the end of its slice and its
 * result count; after a barrier the master thread merges the per-thread
 * counters into `buckets` and packs the gathered entries so they end at SA[n - 1].
 *
 * The parallel region is only requested when threads > 1, n >= 65536 and
 * OpenMP dynamic thread adjustment is off (omp_get_dynamic() == 0).
 *
 * Returns the total number of gathered entries.
 */
static sa_sint_t libsais64_count_and_gather_lms_suffixes_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t m = 0;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && omp_get_dynamic() == 0)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num    = 0;
        fast_sint_t omp_num_threads   = 1;
#endif
        /* Slice length is rounded down to a multiple of 16; the last thread also takes the remainder. */
        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16);
        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

        if (omp_num_threads == 1)
        {
            m = libsais64_count_and_gather_lms_suffixes_8u(T, SA, n, buckets, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                /* position = one past the end of this thread's slice; its results sit just below it in SA. */
                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
                thread_state[omp_thread_num].state.m = libsais64_count_and_gather_lms_suffixes_8u(T, SA, n, thread_state[omp_thread_num].state.buckets, omp_block_start, omp_block_size);

                if (thread_state[omp_thread_num].state.m > 0)
                {
                    thread_state[omp_thread_num].state.last_lms_suffix = SA[thread_state[omp_thread_num].state.position - 1];
                }
            }

            #pragma omp barrier

            #pragma omp master
            {
                memset(buckets, 0, (size_t)4 * ALPHABET_SIZE * sizeof(sa_sint_t));

                /* Walk the threads from the last slice to the first. */
                fast_sint_t t;
                for (t = omp_num_threads - 1; t >= 0; --t)
                {
                    m += (sa_sint_t)thread_state[t].state.m;

                    /* The last slice already ends at n; every other slice's results are moved
                       so that all results form one contiguous run ending at SA[n - 1]. */
                    if (t != omp_num_threads - 1 && thread_state[t].state.m > 0)
                    {
                        memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.m], (size_t)thread_state[t].state.m * sizeof(sa_sint_t));
                    }

                    {
                        /* Add this thread's counters to the global ones and leave in the thread's
                           own table the global value from before the addition (the sum over the
                           threads visited earlier, i.e. the later slices). */
                        sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
                        fast_sint_t s; for (s = 0; s < 4 * ALPHABET_SIZE; s += 1) { sa_sint_t A = buckets[s], B = temp_bucket[s]; buckets[s] = A + B; temp_bucket[s] = A; }
                    }
                }
            }
        }
#endif
    }

    return m;
}

/*
 * Count-and-gather kernel for text stored as sa_sint_t symbols, using four counters per symbol.
 *
 * Clears 4 * k counters, then scans T[omp_block_start .. omp_block_start + omp_block_size)
 * from right to left. Every position gets a flag that is 1 when its symbol is greater than
 * the symbol to its right, or equal to it while the right neighbour's flag is 1.
 * For each position the counter BUCKETS_INDEX4(symbol, 2 * own_flag + left_flag) is
 * incremented, and the position is written to SA[m]; m only moves down when the left
 * flag is 1 and the own flag is 0, so other writes are overwritten by the next one.
 * (Given the function name these kept positions are presumably the LMS suffixes.)
 *
 * Returns the number of kept positions; they occupy the SA slots just below
 * omp_block_start + omp_block_size.
 */
static sa_sint_t libsais64_count_and_gather_lms_suffixes_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    memset(buckets, 0, 4 * (size_t)k * sizeof(sa_sint_t));

    fast_sint_t m = omp_block_start + omp_block_size - 1;

    if (omp_block_size > 0)
    {
        const fast_sint_t prefetch_distance = 64;

        fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;

        /* Look past the block for the first symbol that differs from the block's last symbol
           (c1 stays at the last symbol read, or -1 when nothing follows the block). */
        while (j < n && (c1 = T[j]) == c0) { ++j; }

        fast_uint_t f0 = c0 >= c1, f1 = 0;

        /* Main loop, four positions per iteration, prefetching T and the counters ahead of use. */
        for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
        {
            libsais64_prefetchr(&T[i - 2 * prefetch_distance]);

            libsais64_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 0], 0)]);
            libsais64_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 1], 0)]);
            libsais64_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 2], 0)]);
            libsais64_prefetchw(&buckets[BUCKETS_INDEX4(T[i - prefetch_distance - 3], 0)]);

            /* c0/f0 and c1/f1 swap roles on every step so no register copy is needed. */
            c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0);
            buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++;

            c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 0); m -= (f0 & ~f1);
            buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++;

            c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i - 1); m -= (f1 & ~f0);
            buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++;

            c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 2); m -= (f0 & ~f1);
            buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++;
        }

        /* Remaining positions of the block, one at a time and without prefetching. */
        for (j -= prefetch_distance + 3; i >= j; i -= 1)
        {
            c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i + 1); m -= (f0 & ~f1);
            buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]++;
        }

        /* Final step for the first position of the block (the statement continues on the next source line). */
        c1 = (i >= 0) ?
/* Continuation of libsais64_count_and_gather_lms_suffixes_32s_4k (started on an earlier source line):
   the first position of the block is compared with T[i] just before the block, or with -1 when i < 0. */
        T[i] : -1; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0);
        buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0 + f1)]++;
    }

    /* m was lowered once per kept entry, so this difference is the number of entries kept. */
    return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
}

/*
 * Count-and-gather kernel for sa_sint_t text using two counters per symbol.
 *
 * Clears 2 * k counters, then scans T[omp_block_start .. omp_block_start + omp_block_size)
 * from right to left. Each position gets a flag that is 1 when its symbol is greater than
 * the symbol to its right, or equal to it while the right neighbour's flag is 1.
 * A position is kept (SA[m] written and m lowered) when the left flag is 1 and its own
 * flag is 0; that same bit selects which of the symbol's two counters is incremented.
 *
 * Returns the number of kept positions.
 */
static sa_sint_t libsais64_count_and_gather_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));

    fast_sint_t m = omp_block_start + omp_block_size - 1;

    if (omp_block_size > 0)
    {
        const fast_sint_t prefetch_distance = 64;

        fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;

        /* Find the first symbol after the block that differs from the block's last symbol. */
        while (j < n && (c1 = T[j]) == c0) { ++j; }

        fast_uint_t f0 = c0 >= c1, f1 = 0;

        /* Main loop, four positions per iteration, with prefetching of T and the counters. */
        for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
        {
            libsais64_prefetchr(&T[i - 2 * prefetch_distance]);

            libsais64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0], 0)]);
            libsais64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1], 0)]);
            libsais64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2], 0)]);
            libsais64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3], 0)]);

            c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0);
            buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++;

            c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 0); m -= (f0 & ~f1);
            buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++;

            c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i - 1); m -= (f1 & ~f0);
            buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++;

            c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 2); m -= (f0 & ~f1);
            buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++;
        }

        /* Remaining positions of the block, one at a time. */
        for (j -= prefetch_distance + 3; i >= j; i -= 1)
        {
            c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 -
                (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i + 1); m -= (f0 & ~f1);
            buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++;
        }

        /* First position of the block: compare with the symbol before the block, or -1 at the start of T. */
        c1 = (i >= 0) ? T[i] : -1; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0);
        buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++;
    }

    return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
}

/*
 * Variant of libsais64_count_and_gather_lms_suffixes_32s_2k for "compacted" text.
 *
 * Differences visible in the code: a position is only kept when, in addition to the
 * flag condition, its symbol is non-negative (symbol >= 0), and every symbol is masked
 * with SAINT_MAX before it is used as a counter index.
 * NOTE(review): the sign bit presumably marks symbols that were already handled
 * elsewhere - confirm against the code that produces the compacted text.
 *
 * Returns the number of kept positions.
 */
static sa_sint_t libsais64_count_and_gather_compacted_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));

    fast_sint_t m = omp_block_start + omp_block_size - 1;

    if (omp_block_size > 0)
    {
        const fast_sint_t prefetch_distance = 64;

        fast_sint_t i, j = m + 1, c0 = T[m], c1 = -1;

        /* Find the first symbol after the block that differs from the block's last symbol. */
        while (j < n && (c1 = T[j]) == c0) { ++j; }

        fast_uint_t f0 = c0 >= c1, f1 = 0;

        for (i = m - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
        {
            libsais64_prefetchr(&T[i - 2 * prefetch_distance]);

            libsais64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 0] & SAINT_MAX, 0)]);
            libsais64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 1] & SAINT_MAX, 0)]);
            libsais64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 2] & SAINT_MAX, 0)]);
            libsais64_prefetchw(&buckets[BUCKETS_INDEX2(T[i - prefetch_distance - 3] & SAINT_MAX, 0)]);

            /* The flag uses the raw symbol; the mask is applied only afterwards, for the counter index. */
            c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0 & (c0 >= 0)); c0 &= SAINT_MAX;
            buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++;

            c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 0); m -= (f0 & ~f1 & (c1 >= 0)); c1 &= SAINT_MAX;
            buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++;

            c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i - 1); m -= (f1 & ~f0 & (c0 >= 0)); c0 &= SAINT_MAX;
            buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 &
/* Continuation of libsais64_count_and_gather_compacted_lms_suffixes_32s_2k (started on an earlier
   source line): end of the third step and the fourth step of the four-way unrolled loop. */
                ~f0))]++;

            c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i - 2); m -= (f0 & ~f1 & (c1 >= 0)); c1 &= SAINT_MAX;
            buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++;
        }

        /* Remaining positions of the block, one at a time. */
        for (j -= prefetch_distance + 3; i >= j; i -= 1)
        {
            c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1))); SA[m] = (sa_sint_t)(i + 1); m -= (f0 & ~f1 & (c1 >= 0)); c1 &= SAINT_MAX;
            buckets[BUCKETS_INDEX2((fast_uint_t)c1, (f0 & ~f1))]++;
        }

        /* First position of the block: compare with the symbol before the block, or -1 at the start of T. */
        c1 = (i >= 0) ? T[i] : -1; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0))); SA[m] = (sa_sint_t)(i + 1); m -= (f1 & ~f0 & (c0 >= 0)); c0 &= SAINT_MAX;
        buckets[BUCKETS_INDEX2((fast_uint_t)c0, (f1 & ~f0))]++;
    }

    /* m was lowered once per kept entry, so this difference is the number of entries kept. */
    return (sa_sint_t)(omp_block_start + omp_block_size - 1 - m);
}

#if defined(LIBSAIS_OPENMP)

/*
 * Chooses the distance between consecutive per-thread counter tables.
 *
 * Returns bucket_size rounded up to a multiple of 1024 when
 * free_space / (num_buckets - 1) is at least that large, otherwise bucket_size
 * rounded up to a multiple of 16 when that fits, otherwise bucket_size itself.
 *
 * num_buckets must be greater than 1 (it is used as a divisor after subtracting 1);
 * the callers visible here only call this from their multi-threaded branch.
 */
static fast_sint_t libsais64_get_bucket_stride(fast_sint_t free_space, fast_sint_t bucket_size, fast_sint_t num_buckets)
{
    fast_sint_t bucket_size_1024 = (bucket_size + 1023) & (-1024); if (free_space / (num_buckets - 1) >= bucket_size_1024) { return bucket_size_1024; }
    fast_sint_t bucket_size_16 = (bucket_size + 15) & (-16); if (free_space / (num_buckets - 1) >= bucket_size_16) { return bucket_size_16; }

    return bucket_size;
}

/*
 * Multi-threaded count-and-gather (four counters per symbol) that keeps one counter
 * table per thread in otherwise free space.
 *
 * Thread t writes its counters at buckets - t * bucket_stride. After the barrier the
 * last thread packs all gathered entries so they end at SA[n - 1], while the other
 * threads split the counter range among themselves and call
 * libsais64_accumulate_counts_s32 on their part.
 * NOTE(review): libsais64_accumulate_counts_s32 is defined elsewhere; presumably it
 * sums the per-thread tables into `buckets` - confirm.
 * NOTE(review): `buckets - &SA[n]` assumes `buckets` lies after SA[n] in the same
 * allocation when local_buckets is 0 - confirm with the caller.
 *
 * Returns the total number of gathered entries.
 */
static sa_sint_t libsais64_count_and_gather_lms_suffixes_32s_4k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t m = 0;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(local_buckets); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num    = 0;
        fast_sint_t omp_num_threads   = 1;
#endif
        /* Slice length is rounded down to a multiple of 16; the last thread also takes the remainder. */
        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16);
        fast_sint_t omp_block_start   = omp_thread_num *
            omp_block_stride;
        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

        if (omp_num_threads == 1)
        {
            m = libsais64_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            fast_sint_t bucket_size       = 4 * (fast_sint_t)k;
            fast_sint_t free_space        = local_buckets ? LIBSAIS_LOCAL_BUFFER_SIZE : buckets - &SA[n];
            fast_sint_t bucket_stride     = libsais64_get_bucket_stride(free_space, bucket_size, omp_num_threads);

            {
                /* Each thread counts into its own table, located bucket_stride entries below the previous one. */
                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
                thread_state[omp_thread_num].state.count = libsais64_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            if (omp_thread_num == omp_num_threads - 1)
            {
                /* Last thread: pack every slice's results into one run ending at SA[n - 1]
                   (its own slice already ends at n, so it is not copied). */
                fast_sint_t t;
                for (t = omp_num_threads - 1; t >= 0; --t)
                {
                    m += (sa_sint_t)thread_state[t].state.count;

                    if (t != omp_num_threads - 1 && thread_state[t].state.count > 0)
                    {
                        memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.count], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
                    }
                }
            }
            else
            {
                /* Remaining threads: divide the counter range among omp_num_threads - 1 workers;
                   the original thread count is passed on as omp_num_threads + 1. */
                omp_num_threads   = omp_num_threads - 1;
                omp_block_stride  = (bucket_size / omp_num_threads) & (-16);
                omp_block_start   = omp_thread_num * omp_block_stride;
                omp_block_size    = omp_thread_num < omp_num_threads - 1 ?
                    omp_block_stride : bucket_size - omp_block_start;

                libsais64_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads + 1);
            }
        }
#endif
    }

    return m;
}

/*
 * Two-counters-per-symbol counterpart of libsais64_count_and_gather_lms_suffixes_32s_4k_fs_omp:
 * same structure, but the kernel is libsais64_count_and_gather_lms_suffixes_32s_2k and each
 * per-thread counter table holds 2 * k entries.
 *
 * Returns the total number of gathered entries.
 */
static sa_sint_t libsais64_count_and_gather_lms_suffixes_32s_2k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t m = 0;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(local_buckets); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num    = 0;
        fast_sint_t omp_num_threads   = 1;
#endif
        /* Slice length is rounded down to a multiple of 16; the last thread also takes the remainder. */
        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16);
        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

        if (omp_num_threads == 1)
        {
            m = libsais64_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            fast_sint_t bucket_size       = 2 * (fast_sint_t)k;
            fast_sint_t free_space        = local_buckets ?
/* Continuation of libsais64_count_and_gather_lms_suffixes_32s_2k_fs_omp (started on an earlier
   source line), inside its multi-threaded branch: the free space is either the fixed local
   buffer size or the distance from SA[n] to `buckets`. */
                LIBSAIS_LOCAL_BUFFER_SIZE : buckets - &SA[n];
            fast_sint_t bucket_stride     = libsais64_get_bucket_stride(free_space, bucket_size, omp_num_threads);

            {
                /* Each thread counts into its own table, located bucket_stride entries below the previous one. */
                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
                thread_state[omp_thread_num].state.count = libsais64_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            if (omp_thread_num == omp_num_threads - 1)
            {
                /* Last thread: pack every slice's results into one run ending at SA[n - 1]
                   (its own slice already ends at n, so it is not copied). */
                fast_sint_t t;
                for (t = omp_num_threads - 1; t >= 0; --t)
                {
                    m += (sa_sint_t)thread_state[t].state.count;

                    if (t != omp_num_threads - 1 && thread_state[t].state.count > 0)
                    {
                        memcpy(&SA[n - m], &SA[thread_state[t].state.position - thread_state[t].state.count], (size_t)thread_state[t].state.count * sizeof(sa_sint_t));
                    }
                }
            }
            else
            {
                /* Remaining threads: divide the counter range among omp_num_threads - 1 workers;
                   the original thread count is passed on as omp_num_threads + 1. */
                omp_num_threads   = omp_num_threads - 1;
                omp_block_stride  = (bucket_size / omp_num_threads) & (-16);
                omp_block_start   = omp_thread_num * omp_block_stride;
                omp_block_size    = omp_thread_num < omp_num_threads - 1 ?
                    omp_block_stride : bucket_size - omp_block_start;

                libsais64_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads + 1);
            }
        }
#endif
    }

    return m;
}

/*
 * Multi-threaded count-and-gather for compacted text with per-thread counter tables.
 *
 * Unlike the non-compacted *_fs_omp variants, every thread gathers into SA + n
 * (the second n entries of SA), and after the barrier every thread copies its own
 * results into their final place below SA[n] and then accumulates its share of the
 * counter range. Nothing is returned.
 * NOTE(review): libsais64_accumulate_counts_s32 is defined elsewhere; presumably it
 * sums the per-thread tables into `buckets` - confirm.
 */
static void libsais64_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(local_buckets); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num    = 0;
        fast_sint_t omp_num_threads   = 1;
#endif
        /* Slice length is rounded down to a multiple of 16; the last thread also takes the remainder. */
        fast_sint_t omp_block_stride  = (n / omp_num_threads) & (-16);
        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

        if (omp_num_threads == 1)
        {
            libsais64_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            fast_sint_t bucket_size       = 2 * (fast_sint_t)k;
            fast_sint_t free_space        = local_buckets ?
                LIBSAIS_LOCAL_BUFFER_SIZE : buckets - &SA[(fast_sint_t)n + (fast_sint_t)n];
            fast_sint_t bucket_stride     = libsais64_get_bucket_stride(free_space, bucket_size, omp_num_threads);

            {
                /* Gather into the upper half (SA + n) so the lower half is not disturbed yet. */
                thread_state[omp_thread_num].state.position = omp_block_start + omp_block_size;
                thread_state[omp_thread_num].state.count = libsais64_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA + n, n, k, buckets - (omp_thread_num * bucket_stride), omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            {
                /* m = number of entries produced by this thread and all later slices; this
                   thread's entries therefore start at SA[n - m]. */
                fast_sint_t t, m = 0; for (t = omp_num_threads - 1; t >= omp_thread_num; --t) { m += (sa_sint_t)thread_state[t].state.count; }

                if (thread_state[omp_thread_num].state.count > 0)
                {
                    memcpy(&SA[n - m], &SA[n + thread_state[omp_thread_num].state.position - thread_state[omp_thread_num].state.count], (size_t)thread_state[omp_thread_num].state.count * sizeof(sa_sint_t));
                }
            }

            {
                /* All threads take part in accumulating the counters here. */
                omp_block_stride  = (bucket_size / omp_num_threads) & (-16);
                omp_block_start   = omp_thread_num * omp_block_stride;
                omp_block_size    = omp_thread_num < omp_num_threads - 1 ?
                    omp_block_stride : bucket_size - omp_block_start;

                libsais64_accumulate_counts_s32(buckets + omp_block_start, omp_block_size, bucket_stride, omp_num_threads);
            }
        }
#endif
    }
}

#endif

/*
 * Count-and-gather (four counters per symbol) without extra counter tables.
 *
 * Runs the combined kernel on one thread, or - when the parallel region is active -
 * uses exactly two threads: thread 0 only counts (libsais64_count_lms_suffixes_32s_4k)
 * and the other thread only gathers (libsais64_gather_lms_suffixes_32s).
 *
 * Returns the number of gathered entries.
 */
static sa_sint_t libsais64_count_and_gather_lms_suffixes_32s_4k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads)
{
    sa_sint_t m = 0;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(threads);

        fast_sint_t omp_num_threads   = 1;
#endif
        if (omp_num_threads == 1)
        {
            m = libsais64_count_and_gather_lms_suffixes_32s_4k(T, SA, n, k, buckets, 0, n);
        }
#if defined(LIBSAIS_OPENMP)
        else if (omp_thread_num == 0)
        {
            libsais64_count_lms_suffixes_32s_4k(T, n, k, buckets);
        }
        else
        {
            m = libsais64_gather_lms_suffixes_32s(T, SA, n);
        }
#endif
    }

    return m;
}

/*
 * Two-counters-per-symbol counterpart of libsais64_count_and_gather_lms_suffixes_32s_4k_nofs_omp:
 * one thread runs the combined kernel, or two threads split counting and gathering.
 *
 * Returns the number of gathered entries.
 */
static sa_sint_t libsais64_count_and_gather_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads)
{
    sa_sint_t m = 0;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(threads);

        fast_sint_t omp_num_threads   = 1;
#endif
        if (omp_num_threads == 1)
        {
            m = libsais64_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
        }
#if defined(LIBSAIS_OPENMP)
        else if (omp_thread_num == 0)
        {
            libsais64_count_lms_suffixes_32s_2k(T, n, k, buckets);
        }
        else
        {
            m = libsais64_gather_lms_suffixes_32s(T, SA, n);
        }
#endif
    }

    return m;
}

/*
 * Compacted-text counterpart of the *_nofs_omp functions above
 * (the parameter list continues on the next source line).
 */
static sa_sint_t libsais64_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k,
/* Continuation of libsais64_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp (its signature
   starts on the previous source line): one thread runs the combined compacted kernel, or two
   threads split the work into counting (thread 0) and gathering (the other thread).
   Returns the number of gathered entries. */
    sa_sint_t * RESTRICT buckets, sa_sint_t threads)
{
    sa_sint_t m = 0;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(2) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(threads);

        fast_sint_t omp_num_threads   = 1;
#endif
        if (omp_num_threads == 1)
        {
            m = libsais64_count_and_gather_compacted_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n);
        }
#if defined(LIBSAIS_OPENMP)
        else if (omp_thread_num == 0)
        {
            libsais64_count_compacted_lms_suffixes_32s_2k(T, n, k, buckets);
        }
        else
        {
            m = libsais64_gather_compacted_lms_suffixes_32s(T, SA, n);
        }
#endif
    }

    return m;
}

/*
 * Entry point for count-and-gather with four counters per symbol.
 *
 * With OpenMP it computes how many per-thread counter tables (each 4 * k entries,
 * rounded up to a multiple of 16) fit into the free space, caps that by `threads`,
 * and uses the *_fs_omp variant when more than one table fits, n >= 65536 and
 * n / k >= 2; the thread count is further limited to n / 16 / k but never below 2.
 * In every other case, and without OpenMP, the *_nofs_omp variant is used.
 *
 * Returns the number of gathered entries.
 */
static sa_sint_t libsais64_count_and_gather_lms_suffixes_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t m;

#if defined(LIBSAIS_OPENMP)
    fast_sint_t free_space = local_buckets ? LIBSAIS_LOCAL_BUFFER_SIZE : (buckets - &SA[n]);
    sa_sint_t max_threads = (sa_sint_t)(free_space / ((4 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; }

    if (max_threads > 1 && n >= 65536 && n / k >= 2)
    {
        if (max_threads > n / 16 / k) { max_threads = n / 16 / k; }
        m = libsais64_count_and_gather_lms_suffixes_32s_4k_fs_omp(T, SA, n, k, buckets, local_buckets, max_threads > 2 ?
            max_threads : 2, thread_state);
    }
    else
#else
    UNUSED(local_buckets); UNUSED(thread_state);
#endif
    {
        /* With OpenMP this block is the `else` branch; without it, it is the only path. */
        m = libsais64_count_and_gather_lms_suffixes_32s_4k_nofs_omp(T, SA, n, k, buckets, threads);
    }

    return m;
}

/*
 * Entry point for count-and-gather with two counters per symbol.
 *
 * Same decision logic as libsais64_count_and_gather_lms_suffixes_32s_4k_omp, with
 * per-thread tables of 2 * k entries and a thread limit of n / 8 / k (never below 2).
 *
 * Returns the number of gathered entries.
 */
static sa_sint_t libsais64_count_and_gather_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t m;

#if defined(LIBSAIS_OPENMP)
    fast_sint_t free_space = local_buckets ? LIBSAIS_LOCAL_BUFFER_SIZE : (buckets - &SA[n]);
    sa_sint_t max_threads = (sa_sint_t)(free_space / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; }

    if (max_threads > 1 && n >= 65536 && n / k >= 2)
    {
        if (max_threads > n / 8 / k) { max_threads = n / 8 / k; }
        m = libsais64_count_and_gather_lms_suffixes_32s_2k_fs_omp(T, SA, n, k, buckets, local_buckets, max_threads > 2 ? max_threads : 2, thread_state);
    }
    else
#else
    UNUSED(local_buckets); UNUSED(thread_state);
#endif
    {
        /* With OpenMP this block is the `else` branch; without it, it is the only path. */
        m = libsais64_count_and_gather_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads);
    }

    return m;
}

/*
 * Entry point for count-and-gather on compacted text (two counters per symbol).
 *
 * The free space is measured from SA[2 * n], and the *_fs_omp variant is only chosen
 * when local_buckets is 0 (in addition to the conditions used by the non-compacted
 * entry points). The result count of the *_nofs_omp fallback is discarded; nothing is
 * returned.
 */
static void libsais64_count_and_gather_compacted_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    fast_sint_t free_space = local_buckets ? LIBSAIS_LOCAL_BUFFER_SIZE : (buckets - &SA[(fast_sint_t)n + (fast_sint_t)n]);
    sa_sint_t max_threads = (sa_sint_t)(free_space / ((2 * (fast_sint_t)k + 15) & (-16))); if (max_threads > threads) { max_threads = threads; }

    if (!local_buckets && max_threads > 1 && n >= 65536 && n / k >= 2)
    {
        if (max_threads > n / 8 / k) { max_threads = n / 8 / k; }
        libsais64_count_and_gather_compacted_lms_suffixes_32s_2k_fs_omp(T, SA, n, k, buckets, local_buckets, max_threads > 2 ?
            max_threads : 2, thread_state);
    }
    else
#else
    UNUSED(local_buckets); UNUSED(thread_state);
#endif
    {
        /* With OpenMP this block is the `else` branch; without it, it is the only path. */
        libsais64_count_and_gather_compacted_lms_suffixes_32s_2k_nofs_omp(T, SA, n, k, buckets, threads);
    }
}

/*
 * Builds a plain histogram of T[0 .. n): clears buckets[0 .. k) and increments
 * buckets[T[i]] for every i. The main loop handles eight symbols per iteration.
 */
static void libsais64_count_suffixes_32s(const sa_sint_t * RESTRICT T, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets)
{
    const fast_sint_t prefetch_distance = 64;

    memset(buckets, 0, (size_t)k * sizeof(sa_sint_t));

    fast_sint_t i, j;
    for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8)
    {
        libsais64_prefetchr(&T[i + prefetch_distance]);

        buckets[T[i + 0]]++;
        buckets[T[i + 1]]++;
        buckets[T[i + 2]]++;
        buckets[T[i + 3]]++;
        buckets[T[i + 4]]++;
        buckets[T[i + 5]]++;
        buckets[T[i + 6]]++;
        buckets[T[i + 7]]++;
    }

    /* Up to seven leftover symbols. */
    for (j += 7; i < j; i += 1)
    {
        buckets[T[i]]++;
    }
}

/*
 * Turns the four-way counters of an 8-bit alphabet into bucket boundaries.
 *
 * For every symbol j the four counters are summed; bucket_start[j]
 * (buckets[6 * ALPHABET_SIZE + j]) receives the running sum before the symbol and
 * bucket_end[j] (buckets[7 * ALPHABET_SIZE + j]) the running sum after it. When
 * `freq` is not NULL, the per-symbol total is stored in freq[j] as well.
 *
 * Returns one more than the largest symbol with a non-zero total (0 if there is none).
 */
static sa_sint_t libsais64_initialize_buckets_start_and_end_8u(sa_sint_t * RESTRICT buckets, sa_sint_t * RESTRICT freq)
{
    sa_sint_t * RESTRICT bucket_start = &buckets[6 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT bucket_end   = &buckets[7 * ALPHABET_SIZE];

    fast_sint_t k = -1;

    if (freq != NULL)
    {
        fast_sint_t i, j; sa_sint_t sum = 0;
        for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1)
        {
            sa_sint_t total = buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)];

            bucket_start[j] = sum; sum += total; bucket_end[j] = sum; k = total > 0 ? j : k; freq[j] = total;
        }
    }
    else
    {
        /* Same loop without the freq[] store. */
        fast_sint_t i, j; sa_sint_t sum = 0;
        for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1)
        {
            sa_sint_t total = buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)];

            bucket_start[j] = sum; sum += total; bucket_end[j] = sum; k = total > 0 ?
/* Continuation of libsais64_initialize_buckets_start_and_end_8u (started on an earlier source line):
   k keeps the index of the last symbol whose total was non-zero. */
                j : k;
        }
    }

    /* k is -1 when every total was zero, so this yields 0 in that case. */
    return (sa_sint_t)(k + 1);
}

/*
 * For a k-symbol alphabet with four counters per symbol: writes the running sum
 * before each symbol to buckets[4 * k + j] and the running sum after it to
 * buckets[5 * k + j].
 */
static void libsais64_initialize_buckets_start_and_end_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
{
    sa_sint_t * RESTRICT bucket_start = &buckets[4 * (fast_sint_t)k];
    sa_sint_t * RESTRICT bucket_end   = &buckets[5 * (fast_sint_t)k];

    fast_sint_t i, j; sa_sint_t sum = 0;
    for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1)
    {
        bucket_start[j] = sum;
        sum += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 2)] + buckets[i + BUCKETS_INDEX4(0, 3)];
        bucket_end[j] = sum;
    }
}

/*
 * For a k-symbol alphabet with two counters per symbol: writes the running sum
 * before each symbol to buckets[2 * k + j] and the running sum after it to
 * buckets[3 * k + j].
 */
static void libsais64_initialize_buckets_start_and_end_32s_4k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
{
    sa_sint_t * RESTRICT bucket_start = &buckets[2 * (fast_sint_t)k];
    sa_sint_t * RESTRICT bucket_end   = &buckets[3 * (fast_sint_t)k];

    fast_sint_t i, j; sa_sint_t sum = 0;
    for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1)
    {
        bucket_start[j] = sum;
        sum += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
        bucket_end[j] = sum;
    }
}

/*
 * In place: replaces the first of each symbol's two counters by the running sum of
 * both counters up to and including that symbol. The second counter is left as is.
 */
static void libsais64_initialize_buckets_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
{
    fast_sint_t i; sa_sint_t sum0 = 0;
    for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0))
    {
        sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)]; buckets[i + BUCKETS_INDEX2(0, 0)] = sum0;
    }
}

/*
 * Repacks a two-counters-per-symbol table: the first counter of every symbol is
 * moved to buckets[0 .. k), then buckets[k] is set to 0 and buckets[0 .. k - 1)
 * is copied to buckets[k + 1 .. 2 * k), i.e. the second half is the first half
 * shifted by one entry.
 * NOTE(review): if the first counters hold bucket ends on entry, the second half
 * becomes the matching bucket starts - confirm against the caller.
 */
static void libsais64_initialize_buckets_start_and_end_32s_2k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
{
    fast_sint_t i, j;
    for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1)
    {
        buckets[j] = buckets[i];
    }

    buckets[k] = 0; memcpy(&buckets[k + 1], buckets, ((size_t)k - 1) * sizeof(sa_sint_t));
}

/*
 * In place: replaces each of the k counters by the sum of all counters before it
 * (exclusive running sum).
 */
static void libsais64_initialize_buckets_start_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
{
    fast_sint_t i; sa_sint_t sum
        = 0;
    for (i = 0; i <= (fast_sint_t)k - 1; i += 1) { sa_sint_t tmp = buckets[i]; buckets[i] = sum; sum += tmp; }
}

/*
 * In place: replaces each of the k counters by the sum of all counters up to and
 * including it (inclusive running sum).
 */
static void libsais64_initialize_buckets_end_32s_1k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
{
    fast_sint_t i; sa_sint_t sum = 0;
    for (i = 0; i <= (fast_sint_t)k - 1; i += 1) { sum += buckets[i]; buckets[i] = sum; }
}

/*
 * Prepares the bucket cursors used by the radix sort for 8-bit text.
 *
 * Step 1 walks T from first_lms_suffix down to 0, recomputes the per-position flags
 * and decrements the matching four-way counters, removing those positions from the
 * counts. Step 2 builds running sums of sub-counters 1 and 3 per symbol in the
 * two-entries-per-symbol table at buckets[4 * ALPHABET_SIZE]: entry (symbol, 1)
 * receives the sum before the symbol and entry (symbol, 0) the sum after it.
 *
 * Returns the final running sum.
 */
static sa_sint_t libsais64_initialize_buckets_for_lms_suffixes_radix_sort_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
{
    {
        fast_uint_t     f0 = 0;
        fast_uint_t     f1 = 0;
        fast_sint_t     c0 = T[first_lms_suffix];
        fast_sint_t     c1 = 0;

        for (; --first_lms_suffix >= 0; )
        {
            c1 = c0; c0 = T[first_lms_suffix]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1)));
            buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]--;
        }

        /* Position 0 has no left neighbour, so its left flag is taken as 0. */
        buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0)]--;
    }

    {
        sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];

        fast_sint_t i, j; sa_sint_t sum = 0;
        for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0))
        {
            temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum; sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum;
        }

        return sum;
    }
}

/*
 * Prepares a two-counters-per-symbol table for the radix sort.
 *
 * The symbol at first_lms_suffix is moved from counter 1 to counter 0, then both
 * counters of every symbol are replaced by inclusive running sums: counter 0 by the
 * sum of both counters, counter 1 by the sum of counter 1 only.
 */
static void libsais64_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
{
    buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++;
    buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--;

    fast_sint_t i; sa_sint_t sum0 = 0, sum1 = 0;
    for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0))
    {
        sum0 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
        sum1 += buckets[i + BUCKETS_INDEX2(0, 1)];

        buckets[i + BUCKETS_INDEX2(0, 0)] = sum0;
        buckets[i + BUCKETS_INDEX2(0, 1)] = sum1;
    }
}

/* Four-counters-per-symbol counterpart for sa_sint_t text (the definition continues on the next source line). */
static sa_sint_t libsais64_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(const
/* Continuation of libsais64_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k (its signature
   starts on the previous source line). Step 1 walks T from first_lms_suffix down to 0 and
   decrements the matching four-way counters; step 2 stores, for every symbol j, the inclusive
   running sum of sub-counters 1 and 3 in buckets[4 * k + j]. Returns the final running sum. */
    sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
{
    {
        fast_uint_t     f0 = 0;
        fast_uint_t     f1 = 0;
        fast_sint_t     c0 = T[first_lms_suffix];
        fast_sint_t     c1 = 0;

        for (; --first_lms_suffix >= 0; )
        {
            c1 = c0; c0 = T[first_lms_suffix]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1)));
            buckets[BUCKETS_INDEX4((fast_uint_t)c1, f1 + f1 + f0)]--;
        }

        /* Position 0 has no left neighbour, so its left flag is taken as 0. */
        buckets[BUCKETS_INDEX4((fast_uint_t)c0, f0 + f0)]--;
    }

    {
        sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k];

        fast_sint_t i, j; sa_sint_t sum = 0;
        for (i = BUCKETS_INDEX4(0, 0), j = 0; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += 1)
        {
            sum += buckets[i + BUCKETS_INDEX4(0, 1)] + buckets[i + BUCKETS_INDEX4(0, 3)]; temp_bucket[j] = sum;
        }

        return sum;
    }
}

/*
 * Prepares a two-counters-per-symbol table for radix and partial sorting.
 *
 * The symbol at first_lms_suffix is moved from counter 1 to counter 0. Then, per
 * symbol j: buckets[2 * k + j] receives the running sum of both counters before the
 * symbol, buckets[3 * k + j] the running sum after it, and counter 1 is replaced by
 * the inclusive running sum of counter 1 values.
 */
static void libsais64_initialize_buckets_for_radix_and_partial_sorting_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix)
{
    sa_sint_t * RESTRICT bucket_start = &buckets[2 * (fast_sint_t)k];
    sa_sint_t * RESTRICT bucket_end   = &buckets[3 * (fast_sint_t)k];

    buckets[BUCKETS_INDEX2(T[first_lms_suffix], 0)]++;
    buckets[BUCKETS_INDEX2(T[first_lms_suffix], 1)]--;

    fast_sint_t i, j; sa_sint_t sum0 = 0, sum1 = 0;
    for (i = BUCKETS_INDEX2(0, 0), j = 0; i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0), j += 1)
    {
        bucket_start[j] = sum1;

        sum0 += buckets[i + BUCKETS_INDEX2(0, 1)];
        sum1 += buckets[i + BUCKETS_INDEX2(0, 0)] + buckets[i + BUCKETS_INDEX2(0, 1)];
        buckets[i + BUCKETS_INDEX2(0, 1)] = sum0;

        bucket_end[j] = sum1;
    }
}

/*
 * Distributes the entries SA[omp_block_start .. omp_block_start + omp_block_size)
 * into their buckets, scanning from right to left: for every entry p the cursor
 * induction_bucket[BUCKETS_INDEX2(T[p], 0)] is decremented and p is stored in SA at
 * the new cursor value. The main loop handles four entries per iteration and
 * prefetches SA and the text symbols it will need.
 */
static void libsais64_radix_sort_lms_suffixes_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4)
    {
        libsais64_prefetchr(&SA[i - 2 *
            prefetch_distance]);

        libsais64_prefetchr(&T[SA[i - prefetch_distance - 0]]);
        libsais64_prefetchr(&T[SA[i - prefetch_distance - 1]]);
        libsais64_prefetchr(&T[SA[i - prefetch_distance - 2]]);
        libsais64_prefetchr(&T[SA[i - prefetch_distance - 3]]);

        sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0;
        sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1;
        sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2;
        sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3;
    }

    /* Remaining entries, one at a time. */
    for (j -= prefetch_distance + 3; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p;
    }
}

/*
 * Radix-sort driver for 8-bit text.
 *
 * When LIBSAIS_FLAGS_GSA is set in `flags`, the first cursor of the table at
 * buckets[4 * ALPHABET_SIZE] is decremented first.
 * One thread: the m - 1 entries SA[n - m + 1 .. n) are sorted with the shared cursors.
 * Several threads: each thread derives private cursors from the shared ones and its
 * own thread_state counters, then sorts the entries it gathered itself; the entry at
 * SA[n - m] is excluded there as well.
 *
 * The parallel region is only requested when threads > 1, n >= 65536, m >= 65536 and
 * OpenMP dynamic thread adjustment is off.
 */
static void libsais64_radix_sort_lms_suffixes_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t flags, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    if (flags & LIBSAIS_FLAGS_GSA) { buckets[4 * ALPHABET_SIZE]--; }

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536 && m >= 65536 && omp_get_dynamic() == 0)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_num_threads   = 1;
#endif
        if (omp_num_threads == 1)
        {
            libsais64_radix_sort_lms_suffixes_8u(T, SA, &buckets[4 * ALPHABET_SIZE], (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                /* Private cursor = shared cursor minus the value stored in sub-counter 1 of this
                   thread's own table; the result overwrites the (symbol, 0) slots of that table.
                   NOTE(review): this relies on what an earlier pass left in state.buckets - confirm. */
                sa_sint_t * RESTRICT src_bucket = &buckets[4 * ALPHABET_SIZE];
                sa_sint_t * RESTRICT dst_bucket = thread_state[omp_thread_num].state.buckets;

                fast_sint_t i, j;
                for (i = BUCKETS_INDEX2(0, 0), j = BUCKETS_INDEX4(0, 1); i <= BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX2(1, 0), j += BUCKETS_INDEX4(1, 0))
                {
                    dst_bucket[i] = src_bucket[i] - dst_bucket[j];
                }
            }

            {
                /* omp_block_start becomes the number of entries owned by this thread and all
                   later threads, so this thread's entries begin at SA[n - omp_block_start]. */
                fast_sint_t t, omp_block_start = 0, omp_block_size =
                    thread_state[omp_thread_num].state.m;
                for (t = omp_num_threads - 1; t >= omp_thread_num; --t) omp_block_start += thread_state[t].state.m;

                /* The thread whose range starts at SA[n - m] skips that first entry. */
                if (omp_block_start == (fast_sint_t)m && omp_block_size > 0)
                {
                    omp_block_start -= 1; omp_block_size -= 1;
                }

                libsais64_radix_sort_lms_suffixes_8u(T, SA, thread_state[omp_thread_num].state.buckets, (fast_sint_t)n - omp_block_start, omp_block_size);
            }
        }
#endif
    }
}

/*
 * Radix-sort kernel for sa_sint_t text with one cursor per symbol: scanning
 * SA[omp_block_start .. omp_block_start + omp_block_size) from right to left, every
 * entry p is stored at --induction_bucket[T[p]]. Prefetching runs in three stages
 * (SA, then the text symbols, then the cursors).
 */
static void libsais64_radix_sort_lms_suffixes_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4)
    {
        libsais64_prefetchr(&SA[i - 3 * prefetch_distance]);

        libsais64_prefetchr(&T[SA[i - 2 * prefetch_distance - 0]]);
        libsais64_prefetchr(&T[SA[i - 2 * prefetch_distance - 1]]);
        libsais64_prefetchr(&T[SA[i - 2 * prefetch_distance - 2]]);
        libsais64_prefetchr(&T[SA[i - 2 * prefetch_distance - 3]]);

        libsais64_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 0]]]);
        libsais64_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 1]]]);
        libsais64_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 2]]]);
        libsais64_prefetchw(&induction_bucket[T[SA[i - prefetch_distance - 3]]]);

        sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[T[p0]]] = p0;
        sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[T[p1]]] = p1;
        sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[T[p2]]] = p2;
        sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[T[p3]]] = p3;
    }

    /* Remaining entries, one at a time. */
    for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; SA[--induction_bucket[T[p]]] = p;
    }
}

/*
 * Radix-sort kernel for sa_sint_t text whose cursors live in a two-entries-per-symbol
 * table (the body continues on the next source line).
 */
static void libsais64_radix_sort_lms_suffixes_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 3; i >= j; i -= 4) { libsais64_prefetchr(&SA[i - 3 * prefetch_distance]); libsais64_prefetchr(&T[SA[i - 2 * prefetch_distance - 0]]); libsais64_prefetchr(&T[SA[i - 2 * prefetch_distance - 1]]); libsais64_prefetchr(&T[SA[i - 2 * prefetch_distance - 2]]); libsais64_prefetchr(&T[SA[i - 2 * prefetch_distance - 3]]); libsais64_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 0]], 0)]); libsais64_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 1]], 0)]); libsais64_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 2]], 0)]); libsais64_prefetchw(&induction_bucket[BUCKETS_INDEX2(T[SA[i - prefetch_distance - 3]], 0)]); sa_sint_t p0 = SA[i - 0]; SA[--induction_bucket[BUCKETS_INDEX2(T[p0], 0)]] = p0; sa_sint_t p1 = SA[i - 1]; SA[--induction_bucket[BUCKETS_INDEX2(T[p1], 0)]] = p1; sa_sint_t p2 = SA[i - 2]; SA[--induction_bucket[BUCKETS_INDEX2(T[p2], 0)]] = p2; sa_sint_t p3 = SA[i - 3]; SA[--induction_bucket[BUCKETS_INDEX2(T[p3], 0)]] = p3; } for (j -= 2 * prefetch_distance + 3; i >= j; i -= 1) { sa_sint_t p = SA[i]; SA[--induction_bucket[BUCKETS_INDEX2(T[p], 0)]] = p; } } #if defined(LIBSAIS_OPENMP) static void libsais64_radix_sort_lms_suffixes_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) { libsais64_prefetchr(&SA[i + 2 * prefetch_distance]); libsais64_prefetchr(&T[SA[i + prefetch_distance + 0]]); libsais64_prefetchr(&T[SA[i + prefetch_distance + 1]]); libsais64_prefetchr(&T[SA[i + prefetch_distance + 2]]); libsais64_prefetchr(&T[SA[i + prefetch_distance + 3]]); libsais64_prefetchw(&cache[i + prefetch_distance]); cache[i + 0].symbol = 
T[cache[i + 0].index = SA[i + 0]]; cache[i + 1].symbol = T[cache[i + 1].index = SA[i + 1]]; cache[i + 2].symbol = T[cache[i + 2].index = SA[i + 2]]; cache[i + 3].symbol = T[cache[i + 3].index = SA[i + 3]]; } for (j += prefetch_distance + 3; i < j; i += 1) { cache[i].symbol = T[cache[i].index = SA[i]]; } } static void libsais64_radix_sort_lms_suffixes_32s_6k_block_sort(sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) { libsais64_prefetchw(&cache[i - 2 * prefetch_distance]); libsais64_prefetchw(&induction_bucket[cache[i - prefetch_distance - 0].symbol]); libsais64_prefetchw(&induction_bucket[cache[i - prefetch_distance - 1].symbol]); libsais64_prefetchw(&induction_bucket[cache[i - prefetch_distance - 2].symbol]); libsais64_prefetchw(&induction_bucket[cache[i - prefetch_distance - 3].symbol]); cache[i - 0].symbol = --induction_bucket[cache[i - 0].symbol]; cache[i - 1].symbol = --induction_bucket[cache[i - 1].symbol]; cache[i - 2].symbol = --induction_bucket[cache[i - 2].symbol]; cache[i - 3].symbol = --induction_bucket[cache[i - 3].symbol]; } for (j -= prefetch_distance + 3; i >= j; i -= 1) { cache[i].symbol = --induction_bucket[cache[i].symbol]; } } static void libsais64_radix_sort_lms_suffixes_32s_2k_block_sort(sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 3; i >= j; i -= 4) { libsais64_prefetchw(&cache[i - 2 * prefetch_distance]); libsais64_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 0].symbol, 0)]); 
libsais64_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 1].symbol, 0)]); libsais64_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 2].symbol, 0)]); libsais64_prefetchw(&induction_bucket[BUCKETS_INDEX2(cache[i - prefetch_distance - 3].symbol, 0)]); cache[i - 0].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 0].symbol, 0)]; cache[i - 1].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 1].symbol, 0)]; cache[i - 2].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 2].symbol, 0)]; cache[i - 3].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i - 3].symbol, 0)]; } for (j -= prefetch_distance + 3; i >= j; i -= 1) { cache[i].symbol = --induction_bucket[BUCKETS_INDEX2(cache[i].symbol, 0)]; } } static void libsais64_radix_sort_lms_suffixes_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); UNUSED(cache); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            libsais64_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                libsais64_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            #pragma omp master
            {
                libsais64_radix_sort_lms_suffixes_32s_6k_block_sort(induction_bucket, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            {
                libsais64_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }
}

/*
 * Processes SA[block_start .. block_start + block_size - 1] with the 2k bucket
 * layout, optionally using an OpenMP team.
 *
 * With a single thread the serial routine is called directly. Otherwise each
 * thread gathers its slice into the cache, the master thread alone runs the
 * bucket update over the whole block, and then each thread places its slice
 * of cached entries back into SA. Barriers separate the three phases.
 */
static void libsais64_radix_sort_lms_suffixes_32s_2k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num    = 0;
        fast_sint_t omp_num_threads   = 1;
#endif
        /* Per-thread slice: the stride is rounded down to a multiple of 16 and
           the last thread takes whatever remains of the block. */
        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            libsais64_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                /* cache - block_start lets the callee index the cache with absolute SA positions. */
                libsais64_radix_sort_lms_suffixes_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            #pragma omp master
            {
                libsais64_radix_sort_lms_suffixes_32s_2k_block_sort(induction_bucket, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            {
                libsais64_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }
}

#endif

/*
 * Driver for the 6k radix sort step.
 *
 * When threads == 1 or m < 65536 the serial routine handles the m - 1 entries
 * SA[n - m + 1 .. n - 1] in one call. Otherwise (OpenMP builds only) the work
 * is cut into chunks of at most threads * LIBSAIS_PER_THREAD_CACHE_SIZE
 * entries; chunk [block_start, block_end) is handed to the block routine as
 * the SA range starting at n - block_end, using thread_state[0]'s cache.
 */
static void libsais64_radix_sort_lms_suffixes_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    if (threads == 1 || m < 65536)
    {
        libsais64_radix_sort_lms_suffixes_32s_6k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start, block_end;
        for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end)
        {
            /* Clamp the final chunk so that the total never exceeds m - 1 entries. */
            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end >= m) { block_end = (fast_sint_t)m - 1; }

            libsais64_radix_sort_lms_suffixes_32s_6k_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end, block_end - block_start, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif
}

static void libsais64_radix_sort_lms_suffixes_32s_2k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    if (threads == 1 || m < 65536)
    {
        libsais64_radix_sort_lms_suffixes_32s_2k(T, SA, induction_bucket, (fast_sint_t)n - (fast_sint_t)m + 1, (fast_sint_t)m - 1);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
fast_sint_t block_start, block_end;
        for (block_start = 0; block_start < (fast_sint_t)m - 1; block_start = block_end)
        {
            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end >= m) { block_end = (fast_sint_t)m - 1; }

            libsais64_radix_sort_lms_suffixes_32s_2k_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, (fast_sint_t)n - block_end, block_end - block_start, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif
}

/*
 * Scans T from index n - 2 down to 0 while tracking two adjacent symbols
 * (c0, c1) and two one-bit flags (f0, f1). Each step computes the new flag as
 * "current symbol > (next symbol - next flag)". Whenever the new flag is set
 * and the previous one is clear, the position following the current one is
 * stored at SA[--buckets[c]], where c is the symbol at that position, and the
 * counter m is incremented.
 *
 * After the scan, if more than one position was stored, SA[buckets[c2]] is
 * reset to 0, c2 being the symbol of the most recently stored position.
 *
 * Returns m, the number of positions stored.
 * NOTE(review): the flags presumably encode SA-IS suffix types and the stored
 * positions are LMS suffixes, as the function name suggests; confirm.
 */
static sa_sint_t libsais64_radix_sort_lms_suffixes_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t             i = n - 2;
    sa_sint_t             m = 0;
    fast_uint_t           f0 = 1;
    fast_uint_t           f1 = 0;
    fast_sint_t           c0 = T[n - 1];
    fast_sint_t           c1 = 0;
    fast_sint_t           c2 = 0;

    /* Unrolled by four; the roles of (c0, f0) and (c1, f1) alternate from one
       step to the next instead of being copied. */
    for (; i >= prefetch_distance + 3; i -= 4)
    {
        libsais64_prefetchr(&T[i - 2 * prefetch_distance]);

        libsais64_prefetchw(&buckets[T[i - prefetch_distance - 0]]);
        libsais64_prefetchw(&buckets[T[i - prefetch_distance - 1]]);
        libsais64_prefetchw(&buckets[T[i - prefetch_distance - 2]]);
        libsais64_prefetchw(&buckets[T[i - prefetch_distance - 3]]);

        c1 = T[i - 0]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0)));
        if (f1 & ~f0) { SA[--buckets[c2 = c0]] = i + 1; m++; }

        c0 = T[i - 1]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1)));
        if (f0 & ~f1) { SA[--buckets[c2 = c1]] = i - 0; m++; }

        c1 = T[i - 2]; f1 = (fast_uint_t)(c1 > (c0 - (fast_sint_t)(f0)));
        if (f1 & ~f0) { SA[--buckets[c2 = c0]] = i - 1; m++; }

        c0 = T[i - 3]; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1)));
        if (f0 & ~f1) { SA[--buckets[c2 = c1]] = i - 2; m++; }
    }

    /* Tail: one position per iteration, shifting (c0, f0) into (c1, f1). */
    for (; i >= 0; i -= 1)
    {
        c1 = c0; c0 = T[i]; f1 = f0; f0 = (fast_uint_t)(c0 > (c1 - (fast_sint_t)(f1)));
        if (f0 & ~f1) { SA[--buckets[c2 = c1]] = i + 1; m++; }
    }

    if (m > 1)
    {
        SA[buckets[c2]] = 0;
    }

    return m;
}

/*
 * For every index i in [omp_block_start, omp_block_start + omp_block_size)
 * sets the SAINT_MIN bit of SA[induction_bucket[i]].
 */
static void libsais64_radix_sort_set_markers_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;

    /* Four indices per iteration with look-ahead prefetching. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
    {
        libsais64_prefetchr(&induction_bucket[i + 2 * prefetch_distance]);

        libsais64_prefetchw(&SA[induction_bucket[i + prefetch_distance + 0]]);
        libsais64_prefetchw(&SA[induction_bucket[i + prefetch_distance + 1]]);
        libsais64_prefetchw(&SA[induction_bucket[i + prefetch_distance + 2]]);
        libsais64_prefetchw(&SA[induction_bucket[i + prefetch_distance + 3]]);

        SA[induction_bucket[i + 0]] |= SAINT_MIN;
        SA[induction_bucket[i + 1]] |= SAINT_MIN;
        SA[induction_bucket[i + 2]] |= SAINT_MIN;
        SA[induction_bucket[i + 3]] |= SAINT_MIN;
    }

    /* Remaining indices up to the end of the block. */
    for (j += prefetch_distance + 3; i < j; i += 1)
    {
        SA[induction_bucket[i]] |= SAINT_MIN;
    }
}

/*
 * Like the 6k variant above, but the bucket entry for index i is read from
 * induction_bucket[BUCKETS_INDEX2(i, 0)] and the bit that is set is
 * SUFFIX_GROUP_MARKER.
 */
static void libsais64_radix_sort_set_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4)
    {
        libsais64_prefetchr(&induction_bucket[BUCKETS_INDEX2(i + 2 * prefetch_distance, 0)]);

        libsais64_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 0, 0)]]);
        libsais64_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 1, 0)]]);
        libsais64_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 2, 0)]]);
        libsais64_prefetchw(&SA[induction_bucket[BUCKETS_INDEX2(i + prefetch_distance + 3, 0)]]);

        SA[induction_bucket[BUCKETS_INDEX2(i + 0, 0)]] |= SUFFIX_GROUP_MARKER;
        SA[induction_bucket[BUCKETS_INDEX2(i + 1, 0)]] |= SUFFIX_GROUP_MARKER;
        SA[induction_bucket[BUCKETS_INDEX2(i + 2, 0)]] |= SUFFIX_GROUP_MARKER;
        SA[induction_bucket[BUCKETS_INDEX2(i + 3, 0)]] |= SUFFIX_GROUP_MARKER;
    }

    for (j += prefetch_distance + 3; i < j; i += 1)
    {
        SA[induction_bucket[BUCKETS_INDEX2(i, 0)]] |= SUFFIX_GROUP_MARKER;
    }
} static void libsais64_radix_sort_set_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && k >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); fast_sint_t omp_block_stride = (((fast_sint_t)k - 1) / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : (fast_sint_t)k - 1 - omp_block_start; #else UNUSED(threads); fast_sint_t omp_block_start = 0; fast_sint_t omp_block_size = (fast_sint_t)k - 1; #endif libsais64_radix_sort_set_markers_32s_6k(SA, induction_bucket, omp_block_start, omp_block_size); } } static void libsais64_radix_sort_set_markers_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && k >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); fast_sint_t omp_block_stride = (((fast_sint_t)k - 1) / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : (fast_sint_t)k - 1 - omp_block_start; #else UNUSED(threads); fast_sint_t omp_block_start = 0; fast_sint_t omp_block_size = (fast_sint_t)k - 1; #endif libsais64_radix_sort_set_markers_32s_4k(SA, induction_bucket, omp_block_start, omp_block_size); } } static void libsais64_initialize_buckets_for_partial_sorting_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count) { sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE]; buckets[BUCKETS_INDEX4((fast_uint_t)T[first_lms_suffix], 1)]++; fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0; for (i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX4(ALPHABET_SIZE - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) { temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0; sum0 += buckets[i + BUCKETS_INDEX4(0, 0)] + buckets[i + BUCKETS_INDEX4(0, 2)]; sum1 += buckets[i + BUCKETS_INDEX4(0, 1)]; buckets[j + BUCKETS_INDEX2(0, 0)] = sum0; buckets[j + BUCKETS_INDEX2(0, 1)] = sum1; } } static void libsais64_initialize_buckets_for_partial_sorting_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count) { sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k]; fast_sint_t i, j; sa_sint_t sum0 = left_suffixes_count + 1, sum1 = 0, sum2 = 0; for (first_lms_suffix = T[first_lms_suffix], i = BUCKETS_INDEX4(0, 0), j = BUCKETS_INDEX2(0, 0); i != BUCKETS_INDEX4((fast_sint_t)first_lms_suffix, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0)) { sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)]; sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)]; sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)]; sa_sint_t LL = buckets[i + BUCKETS_INDEX4(0, 3)]; buckets[i + BUCKETS_INDEX4(0, 0)] = sum0; buckets[i + BUCKETS_INDEX4(0, 1)] = sum2; buckets[i + BUCKETS_INDEX4(0, 2)] = 0; buckets[i + BUCKETS_INDEX4(0, 3)] = 0; sum0 += SS + SL; 
sum1 += LS; sum2 += LS + LL;

        temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0;
        temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1;
    }

    for (sum1 += 1; i <= BUCKETS_INDEX4((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX4(1, 0), j += BUCKETS_INDEX2(1, 0))
    {
        sa_sint_t SS = buckets[i + BUCKETS_INDEX4(0, 0)];
        sa_sint_t LS = buckets[i + BUCKETS_INDEX4(0, 1)];
        sa_sint_t SL = buckets[i + BUCKETS_INDEX4(0, 2)];
        sa_sint_t LL = buckets[i + BUCKETS_INDEX4(0, 3)];

        buckets[i + BUCKETS_INDEX4(0, 0)] = sum0;
        buckets[i + BUCKETS_INDEX4(0, 1)] = sum2;
        buckets[i + BUCKETS_INDEX4(0, 2)] = 0;
        buckets[i + BUCKETS_INDEX4(0, 3)] = 0;

        sum0 += SS + SL; sum1 += LS; sum2 += LS + LL;

        temp_bucket[j + BUCKETS_INDEX2(0, 0)] = sum0;
        temp_bucket[j + BUCKETS_INDEX2(0, 1)] = sum1;
    }
}

/*
 * Left-to-right scan over SA[omp_block_start .. omp_block_start + omp_block_size - 1]
 * for byte input.
 *
 * For each entry p: d is incremented when p is negative, p is masked with
 * SAINT_MAX, and a bucket index v is formed from T[p - 1] and the comparison
 * T[p - 2] >= T[p - 1]. The value p - 1 is appended at
 * SA[induction_bucket[v]++]; its top bit (SAINT_BIT - 1) is set when
 * distinct_names[v] differs from the current d. distinct_names[v] is then
 * updated to d.
 *
 * induction_bucket starts at buckets[4 * ALPHABET_SIZE], distinct_names at
 * buckets[2 * ALPHABET_SIZE].
 *
 * Returns the final value of d.
 */
static sa_sint_t libsais64_partial_sorting_scan_left_to_right_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];

    fast_sint_t i, j;

    /* Two entries per iteration; the text bytes needed prefetch_distance
       entries ahead are prefetched. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais64_prefetchr(&SA[i + 2 * prefetch_distance]);

        libsais64_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
        libsais64_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
        libsais64_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
        libsais64_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);

        sa_sint_t p0 = SA[i + 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]);
        SA[induction_bucket[v0]++] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d;

        sa_sint_t p1 = SA[i + 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]);
        SA[induction_bucket[v1]++] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
    }

    /* Remaining entries, one per iteration, without prefetching. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
        SA[induction_bucket[v]++] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
    }

    return d;
}

#if defined(LIBSAIS_OPENMP)

static void libsais64_partial_sorting_scan_left_to_right_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size, LIBSAIS_THREAD_STATE * RESTRICT state)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];

    memset(induction_bucket, 0, (size_t)2 * (size_t)k * sizeof(sa_sint_t));
    memset(distinct_names  , 0, (size_t)2 * (size_t)k * sizeof(sa_sint_t));

    fast_sint_t i, j, count = 0; sa_sint_t d = 1;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais64_prefetchr(&SA[i + 2 * prefetch_distance]);

        libsais64_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
        libsais64_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
        libsais64_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
        libsais64_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);

        sa_sint_t p0 = cache[count].index = SA[i + 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); induction_bucket[v0]++; distinct_names[v0] = d;
        sa_sint_t p1 = cache[count].index = SA[i + 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]);
induction_bucket[v1]++; distinct_names[v1] = d;
    }

    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = cache[count].index = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]); induction_bucket[v]++; distinct_names[v] = d;
    }

    state[0].state.position   = (fast_sint_t)d - 1;
    state[0].state.count      = count;
}

/*
 * Replays `count` cached (index, symbol) pairs into SA.
 *
 * For each cache entry, d is incremented when the cached index is negative;
 * the cached symbol v selects the destination SA[induction_bucket[v]++],
 * which receives (index - 1) with its top bit (SAINT_BIT - 1) set when
 * distinct_names[v] differs from the current d. distinct_names[v] is then set
 * to d. Here induction_bucket starts at buckets[0] and distinct_names at
 * buckets[2 * ALPHABET_SIZE].
 */
static void libsais64_partial_sorting_scan_left_to_right_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];

    fast_sint_t i, j;

    /* Two cache entries per iteration. */
    for (i = 0, j = count - 1; i < j; i += 2)
    {
        libsais64_prefetchr(&cache[i + prefetch_distance]);

        sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol;
        SA[induction_bucket[v0]++] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d;

        sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol;
        SA[induction_bucket[v1]++] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
    }

    /* At most one entry is left when count is odd. */
    for (j += 1; i < j; i += 1)
    {
        sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol;
        SA[induction_bucket[v]++] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
    }
}

/*
 * Runs the byte-input left-to-right scan over
 * SA[block_start .. block_start + block_size - 1], optionally in parallel.
 *
 * Single thread: delegates to the serial scan.
 * Several threads (OpenMP builds): each thread first fills its private
 * buckets and cache from its slice ("prepare"); after a barrier the master
 * thread folds the per-thread tables into the shared ones, in thread order,
 * leaving in each thread's private tables the shared values as they stood
 * before that thread's contribution; after a second barrier each thread
 * replays its cache into SA ("place").
 *
 * Returns the updated value of d.
 */
static sa_sint_t libsais64_partial_sorting_scan_left_to_right_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ? k : 256) && omp_get_dynamic() == 0)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(k); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num    = 0;
        fast_sint_t omp_num_threads   = 1;
#endif
        /* Per-thread slice: stride rounded down to a multiple of 16, the last
           thread takes the remainder. */
        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            d = libsais64_partial_sorting_scan_left_to_right_8u(T, SA, buckets, d, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                libsais64_partial_sorting_scan_left_to_right_8u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]);
            }

            #pragma omp barrier

            #pragma omp master
            {
                sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
                sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];

                fast_sint_t t;
                for (t = 0; t < omp_num_threads; ++t)
                {
                    sa_sint_t * RESTRICT temp_induction_bucket    = &thread_state[t].state.buckets[0 * ALPHABET_SIZE];
                    sa_sint_t * RESTRICT temp_distinct_names      = &thread_state[t].state.buckets[2 * ALPHABET_SIZE];

                    /* Shared position advances by the thread's count; the thread
                       keeps the shared position it has to start writing at. */
                    fast_sint_t c;
                    for (c = 0; c < 2 * k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A + B; temp_induction_bucket[c] = A; }

                    /* Thread-local names (B > 0 means the thread touched the
                       bucket) are rebased by d - 1 into the shared numbering. */
                    for (d -= 1, c = 0; c < 2 * k; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ? D : A; temp_distinct_names[c] = A; }
                    d += 1 + (sa_sint_t)thread_state[t].state.position; thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position;
                }
            }

            #pragma omp barrier

            {
                libsais64_partial_sorting_scan_left_to_right_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position);
            }
        }
#endif
    }

    return d;
}

#endif

static sa_sint_t libsais64_partial_sorting_scan_left_to_right_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t * RESTRICT induction_bucket = &buckets[4 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names   = &buckets[2 * ALPHABET_SIZE];

    SA[induction_bucket[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN;
    distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] >= T[n - 1])] = ++d;

    if (threads == 1 || left_suffixes_count < 65536)
    {
        d = libsais64_partial_sorting_scan_left_to_right_8u(T, SA, buckets, d, 0, left_suffixes_count);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start;
        for (block_start = 0; block_start < left_suffixes_count; )
        {
            if (SA[block_start] == 0)
            {
                block_start++;
            }
            else
            {
                fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > left_suffixes_count) { block_max_end = left_suffixes_count;}
                fast_sint_t block_end     = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; }
                fast_sint_t block_size    = block_end - block_start;

                if (block_size < 32)
                {
                    for (; block_start < block_end; block_start += 1)
                    {
                        sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] >= T[p - 1]);
                        SA[induction_bucket[v]++] = (p - 1) |
(sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
                    }
                }
                else
                {
                    d = libsais64_partial_sorting_scan_left_to_right_8u_block_omp(T, SA, k, buckets, d, block_start, block_size, threads, thread_state);
                    block_start = block_end;
                }
            }
        }
    }
#else
    UNUSED(k); UNUSED(thread_state);
#endif

    return d;
}

/*
 * Left-to-right scan over SA[omp_block_start .. omp_block_start + omp_block_size - 1]
 * for integer input with the four-slot-per-symbol bucket layout.
 *
 * For each entry p: d is incremented when p is negative, p is masked with
 * SAINT_MAX, and v = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]) is
 * computed. p - 1 is appended at SA[buckets[v]++], with its top bit
 * (SAINT_BIT - 1) set when buckets[2 + v] differs from d; buckets[2 + v] is
 * then set to d. Thus slot v holds the insertion position and slot v + 2 the
 * last d value seen for that bucket.
 *
 * Returns the final value of d.
 */
static sa_sint_t libsais64_partial_sorting_scan_left_to_right_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;

    /* Two entries per iteration with two prefetch stages: text symbols
       2 * prefetch_distance ahead, bucket slots prefetch_distance ahead. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2)
    {
        libsais64_prefetchr(&SA[i + 3 * prefetch_distance]);

        libsais64_prefetchr(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 1);
        libsais64_prefetchr(&T[SA[i + 2 * prefetch_distance + 0] & SAINT_MAX] - 2);
        libsais64_prefetchr(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 1);
        libsais64_prefetchr(&T[SA[i + 2 * prefetch_distance + 1] & SAINT_MAX] - 2);

        /* (p > 0) keeps the T index non-negative when the entry is 0. */
        sa_sint_t p0 = SA[i + prefetch_distance + 0] & SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); libsais64_prefetchw(&buckets[v0]);
        sa_sint_t p1 = SA[i + prefetch_distance + 1] & SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0); libsais64_prefetchw(&buckets[v1]);

        sa_sint_t p2 = SA[i + 0]; d += (p2 < 0); p2 &= SAINT_MAX; sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] >= T[p2 - 1]);
        SA[buckets[v2]++] = (p2 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1)); buckets[2 + v2] = d;

        sa_sint_t p3 = SA[i + 1]; d += (p3 < 0); p3 &= SAINT_MAX; sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] >= T[p3 - 1]);
        SA[buckets[v3]++] = (p3 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1)); buckets[2 + v3] = d;
    }

    /* Remaining entries, one per iteration, without prefetching. */
    for (j += 2 * prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]);
        SA[buckets[v]++] = (p - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
    }

    return d;
}

static sa_sint_t libsais64_partial_sorting_scan_left_to_right_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[2 * (fast_sint_t)k];
    sa_sint_t * RESTRICT distinct_names   = &buckets[0 * (fast_sint_t)k];

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2)
    {
        libsais64_prefetchw(&SA[i + 3 * prefetch_distance]);

        sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 & ~SUFFIX_GROUP_MARKER : 2]; libsais64_prefetchr(Ts0 - 1); libsais64_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 & ~SUFFIX_GROUP_MARKER : 2]; libsais64_prefetchr(Ts1 - 1); libsais64_prefetchr(Ts1 - 2);
        sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais64_prefetchw(&induction_bucket[Ts2]); libsais64_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); }
        sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais64_prefetchw(&induction_bucket[Ts3]); libsais64_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); }

        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX;
        if (p0 > 0)
        {
            SA[i + 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]);
            SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d;
        }

        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX;
        if (p1 > 0)
        {
            SA[i + 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); p1 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]);
            SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
        }
    }

    for (j += 2 * prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX;
        if (p > 0)
        {
            SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]);
            SA[induction_bucket[T[p - 1]]++] = (p - 1) | (sa_sint_t)((sa_uint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
        }
    }

    return d;
}

/*
 * Left-to-right scan over SA[omp_block_start .. omp_block_start + omp_block_size - 1]
 * for integer input with one bucket counter per symbol.
 *
 * Each entry p is first rewritten in place with its top bit cleared
 * (p & SAINT_MAX). If p is strictly positive the entry is instead set to 0
 * and p - 1 is appended at SA[induction_bucket[T[p - 1]]++], with its top bit
 * (SAINT_BIT - 1) set when T[p - 2] < T[p - 1].
 */
static void libsais64_partial_sorting_scan_left_to_right_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;

    /* Two entries per iteration. Prefetch addresses are only derived from
       strictly positive entries; for other entries index 1 is substituted so
       that Ts - 1 still points at T[0]. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2)
    {
        libsais64_prefetchw(&SA[i + 3 * prefetch_distance]);

        sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 1]; libsais64_prefetchr(Ts0 - 1);
        sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ? s1 : 1]; libsais64_prefetchr(Ts1 - 1);
        sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais64_prefetchw(&induction_bucket[T[s2 - 1]]); libsais64_prefetchr(&T[s2] - 2); }
        sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais64_prefetchw(&induction_bucket[T[s3 - 1]]); libsais64_prefetchr(&T[s3] - 2); }

        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { SA[i + 0] = 0; SA[induction_bucket[T[p0 - 1]]++] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)); }
        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { SA[i + 1] = 0; SA[induction_bucket[T[p1 - 1]]++] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)); }
    }

    /* Remaining entries, one per iteration, without prefetching. */
    for (j += 2 * prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { SA[i] = 0; SA[induction_bucket[T[p - 1]]++] = (p - 1) | (sa_sint_t)((sa_uint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)); }
    }
}

#if defined(LIBSAIS_OPENMP)

static void libsais64_partial_sorting_scan_left_to_right_32s_6k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais64_prefetchr(&SA[i + 2 * prefetch_distance]);
libsais64_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
        libsais64_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
        libsais64_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
        libsais64_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);

        libsais64_prefetchw(&cache[i + prefetch_distance]);

        /* index keeps the raw SA value (flag bits included); symbol stays 0 when the masked value is 0. */
        sa_sint_t p0 = cache[i + 0].index = SA[i + 0]; sa_sint_t symbol0 = 0; p0 &= SAINT_MAX; if (p0 != 0) { symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] >= T[p0 - 1]); } cache[i + 0].symbol = symbol0;
        sa_sint_t p1 = cache[i + 1].index = SA[i + 1]; sa_sint_t symbol1 = 0; p1 &= SAINT_MAX; if (p1 != 0) { symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] >= T[p1 - 1]); } cache[i + 1].symbol = symbol1;
    }

    /* Tail of the block, one slot at a time and without look-ahead. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = cache[i].index = SA[i]; sa_sint_t symbol = 0; p &= SAINT_MAX; if (p != 0) { symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] >= T[p - 1]); } cache[i].symbol = symbol;
    }
}

/*
 * Gather step of the blocked 32s_4k left-to-right scan.
 * For every slot i of the block:
 *   - SA[i] = p > 0: cache[i].index = p, cache[i].symbol = BUCKETS_INDEX2(T[q - 1], T[q - 2] < T[q - 1])
 *     with q = p & ~SUFFIX_GROUP_MARKER, and SA[i] is set to 0;
 *   - otherwise: cache[i].symbol = SAINT_MIN, cache[i].index is left untouched and SA[i] is
 *     rewritten as p & SAINT_MAX.
 */
static void libsais64_partial_sorting_scan_left_to_right_32s_4k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais64_prefetchw(&SA[i + 2 * prefetch_distance]);

        /* Non-positive look-ahead entries fall back to index 2 so that Ts - 1 and Ts - 2 stay inside T. */
        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 & ~SUFFIX_GROUP_MARKER : 2]; libsais64_prefetchr(Ts0 - 1); libsais64_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 & ~SUFFIX_GROUP_MARKER : 2]; libsais64_prefetchr(Ts1 - 1); libsais64_prefetchr(Ts1 - 2);

        libsais64_prefetchw(&cache[i + prefetch_distance]);

        /* Positive entries move into the cache and are zeroed in SA; other entries are tagged SAINT_MIN in the cache and SA keeps p & SAINT_MAX. */
        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { cache[i + 0].index = p0; p0 &= ~SUFFIX_GROUP_MARKER; symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] < T[p0 - 1]); p0 = 0; } cache[i + 0].symbol = symbol0; SA[i + 0] = p0 & SAINT_MAX;
        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { cache[i + 1].index = p1; p1 &= ~SUFFIX_GROUP_MARKER; symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] < T[p1 - 1]); p1 = 0; } cache[i + 1].symbol = symbol1; SA[i + 1] = p1 & SAINT_MAX;
    }

    /* Tail of the block, one slot at a time and without look-ahead. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { cache[i].index = p; p &= ~SUFFIX_GROUP_MARKER; symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] < T[p - 1]); p = 0; } cache[i].symbol = symbol; SA[i] = p & SAINT_MAX;
    }
}

/*
 * Gather step of the blocked 32s_1k left-to-right scan.
 * For every slot i of the block:
 *   - SA[i] = p > 0: cache[i].index = (p - 1) with bit (SAINT_BIT - 1) set when T[p - 2] < T[p - 1],
 *     cache[i].symbol = T[p - 1], and SA[i] is set to 0;
 *   - otherwise: cache[i].symbol = SAINT_MIN, cache[i].index is left untouched and SA[i] is
 *     rewritten as p & SAINT_MAX.
 */
static void libsais64_partial_sorting_scan_left_to_right_32s_1k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais64_prefetchw(&SA[i + 2 * prefetch_distance]);

        /* Non-positive look-ahead entries fall back to index 2 so that Ts - 1 and Ts - 2 stay inside T. */
        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais64_prefetchr(Ts0 - 1); libsais64_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 : 2]; libsais64_prefetchr(Ts1 - 1); libsais64_prefetchr(Ts1 - 2);

        libsais64_prefetchw(&cache[i + prefetch_distance]);

        /* Positive entries are pre-decremented and flagged in the cache, then zeroed in SA; other entries are tagged SAINT_MIN. */
        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { cache[i + 0].index = (p0 - 1) | (sa_sint_t)((sa_uint_t)(T[p0 - 2] < T[p0 - 1]) << (SAINT_BIT - 1)); symbol0 = T[p0 - 1]; p0 = 0; } cache[i + 0].symbol = symbol0; SA[i + 0] = p0 & SAINT_MAX;
        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { cache[i + 1].index = (p1 - 1) | (sa_sint_t)((sa_uint_t)(T[p1 - 2] < T[p1 - 1]) << (SAINT_BIT - 1)); symbol1 = T[p1 - 1]; p1 = 0; } cache[i + 1].symbol = symbol1; SA[i + 1] = p1 & SAINT_MAX;
    }

    /* Tail of the block, one slot at a time and without look-ahead. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { cache[i].index = (p - 1) | (sa_sint_t)((sa_uint_t)(T[p - 2] < T[p - 1]) << (SAINT_BIT - 1)); symbol = T[p - 1]; p = 0; } cache[i].symbol = symbol; SA[i] = p & SAINT_MAX;
    }
}

/*
 * Serial "sort" step of the blocked 32s_6k left-to-right scan, run over the cache entries of
 * [omp_block_start, omp_block_start + omp_block_size).
 * For each entry (v = symbol, p = index):
 *   - d is incremented when p is negative;
 *   - symbol is replaced by the destination slot buckets[v]++;
 *   - index becomes (p - 1) with bit (SAINT_BIT - 1) set when buckets[2 + v] != d, after which
 *     buckets[2 + v] = d;
 *   - when the destination lies before omp_block_end, the new index is also written to
 *     cache[destination] together with its BUCKETS_INDEX4 symbol, so that it is picked up later
 *     in this same pass.
 * Returns the updated d.
 */
static sa_sint_t libsais64_partial_sorting_scan_left_to_right_32s_6k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
    {
        libsais64_prefetchw(&cache[i + 2 * prefetch_distance]);

        libsais64_prefetchw(&buckets[cache[i + prefetch_distance + 0].symbol]);
        libsais64_prefetchw(&buckets[cache[i + prefetch_distance + 1].symbol]);

        sa_sint_t v0 = cache[i + 0].symbol, p0 = cache[i + 0].index; d += (p0 < 0); cache[i + 0].symbol = buckets[v0]++; cache[i + 0].index = (p0 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1)); buckets[2 + v0] = d;
        if (cache[i + 0].symbol < omp_block_end) { sa_sint_t s = cache[i + 0].symbol, q = (cache[s].index = cache[i + 0].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); }

        sa_sint_t v1 =
cache[i + 1].symbol, p1 = cache[i + 1].index; d += (p1 < 0); cache[i + 1].symbol = buckets[v1]++; cache[i + 1].index = (p1 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1)); buckets[2 + v1] = d;
        /* Destination inside the block: forward the new value into the cache so this pass reaches it. */
        if (cache[i + 1].symbol < omp_block_end) { sa_sint_t s = cache[i + 1].symbol, q = (cache[s].index = cache[i + 1].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); }
    }

    /* Tail of the block, one entry at a time and without look-ahead. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t v = cache[i].symbol, p = cache[i].index; d += (p < 0); cache[i].symbol = buckets[v]++; cache[i].index = (p - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
        if (cache[i].symbol < omp_block_end) { sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] >= T[q - 1]); }
    }

    return d;
}

/*
 * Serial "sort" step of the blocked 32s_4k left-to-right scan, run over the cache entries of
 * [omp_block_start, omp_block_start + omp_block_size).
 * buckets is split into distinct_names = &buckets[0] and induction_bucket = &buckets[2 * k].
 * Entries with a negative symbol are skipped. For every other entry (v = symbol, p = index):
 *   - d += p >> (SUFFIX_GROUP_BIT - 1);
 *   - symbol is replaced by the destination slot induction_bucket[v >> 1]++;
 *   - index becomes (p - 1) | (v << (SAINT_BIT - 1)) | ((distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)),
 *     after which distinct_names[v] = d;
 *   - when the destination lies before omp_block_end and the new index np is positive, np is
 *     forwarded to cache[destination] with its BUCKETS_INDEX2 symbol and this entry's index
 *     becomes 0; a non-positive np is kept here as np & SAINT_MAX.
 * Returns the updated d.
 */
static sa_sint_t libsais64_partial_sorting_scan_left_to_right_32s_4k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[2 * (fast_sint_t)k];
    sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k];

    fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
    {
        libsais64_prefetchw(&cache[i + 2 * prefetch_distance]);

        /* Non-positive look-ahead symbols fall back to element 0 so the prefetched address stays valid. */
        sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 > 0 ? s0 >> 1 : 0]; libsais64_prefetchw(Is0); const sa_sint_t * Ds0 = &distinct_names[s0 > 0 ? s0 : 0]; libsais64_prefetchw(Ds0);
        sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 > 0 ? s1 >> 1 : 0]; libsais64_prefetchw(Is1); const sa_sint_t * Ds1 = &distinct_names[s1 > 0 ?
s1 : 0]; libsais64_prefetchw(Ds1);

        sa_sint_t v0 = cache[i + 0].symbol;
        if (v0 >= 0) /* a negative symbol marks an entry with nothing to place */
        {
            sa_sint_t p0 = cache[i + 0].index; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); cache[i + 0].symbol = induction_bucket[v0 >> 1]++; cache[i + 0].index = (p0 - 1) | (sa_sint_t)((sa_uint_t)v0 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d;
            if (cache[i + 0].symbol < omp_block_end)
            {
                /* Destination is still inside this block: forward a positive value into the cache instead of SA. */
                sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i + 0].index = np & SAINT_MAX;
            }
        }

        sa_sint_t v1 = cache[i + 1].symbol;
        if (v1 >= 0)
        {
            sa_sint_t p1 = cache[i + 1].index; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); cache[i + 1].symbol = induction_bucket[v1 >> 1]++; cache[i + 1].index = (p1 - 1) | (sa_sint_t)((sa_uint_t)v1 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
            if (cache[i + 1].symbol < omp_block_end)
            {
                sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i + 1].index = np & SAINT_MAX;
            }
        }
    }

    /* Tail of the block, one entry at a time and without look-ahead. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t v = cache[i].symbol;
        if (v >= 0)
        {
            sa_sint_t p = cache[i].index; d += (p >> (SUFFIX_GROUP_BIT - 1)); cache[i].symbol = induction_bucket[v >> 1]++; cache[i].index = (p - 1) | (sa_sint_t)((sa_uint_t)v << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
            if (cache[i].symbol < omp_block_end)
            {
                sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] < T[np - 1]); np = 0; } cache[i].index = np & SAINT_MAX;
            }
        }
    }

    return d;
}

static void
libsais64_partial_sorting_scan_left_to_right_32s_1k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    /*
     * Serial "sort" step of the blocked 32s_1k left-to-right scan, run over the cache entries of
     * [omp_block_start, omp_block_start + omp_block_size).
     * Entries with a negative symbol are skipped. For every other entry the symbol v is replaced
     * by the destination slot induction_bucket[v]++. When that destination lies before
     * omp_block_end and the entry's index np is positive, (np - 1) - with bit (SAINT_BIT - 1) set
     * when T[np - 2] < T[np - 1] - and the symbol T[np - 1] are forwarded to cache[destination]
     * and this entry's index becomes 0; a non-positive np is kept here as np & SAINT_MAX.
     */
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size;
    for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2)
    {
        libsais64_prefetchw(&cache[i + 2 * prefetch_distance]);

        /* Non-positive look-ahead symbols fall back to element 0 so the prefetched address stays valid. */
        sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 > 0 ? s0 : 0]; libsais64_prefetchw(Is0);
        sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 > 0 ? s1 : 0]; libsais64_prefetchw(Is1);

        sa_sint_t v0 = cache[i + 0].symbol;
        if (v0 >= 0)
        {
            cache[i + 0].symbol = induction_bucket[v0]++;
            if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; if (np > 0) { cache[ni].index = (np - 1) | (sa_sint_t)((sa_uint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i + 0].index = np & SAINT_MAX; }
        }

        sa_sint_t v1 = cache[i + 1].symbol;
        if (v1 >= 0)
        {
            cache[i + 1].symbol = induction_bucket[v1]++;
            if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; if (np > 0) { cache[ni].index = (np - 1) | (sa_sint_t)((sa_uint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i + 1].index = np & SAINT_MAX; }
        }
    }

    /* Tail of the block, one entry at a time and without look-ahead. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t v = cache[i].symbol;
        if (v >= 0)
        {
            cache[i].symbol = induction_bucket[v]++;
            if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[ni].index = (np - 1) | (sa_sint_t)((sa_uint_t)(T[np - 2] < T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; np = 0; } cache[i].index = np & SAINT_MAX; }
        }
    }
}

static sa_sint_t
libsais64_partial_sorting_scan_left_to_right_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
    /*
     * Processes SA[block_start, block_start + block_size) for the 32s_6k left-to-right scan and
     * returns the updated d.
     * With a single thread in the team the plain sequential scan is called. Otherwise the work is
     * done in three phases separated by barriers: every thread gathers its sub-range into the
     * cache, the master thread alone runs the serial block_sort over the whole block, then every
     * thread calls libsais64_place_cached_suffixes on its own sub-range.
     */
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Per-thread share rounded down to a multiple of 16; the last thread also takes the remainder. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            d = libsais64_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                /* cache - block_start lets the helpers index the cache with absolute SA positions. */
                libsais64_partial_sorting_scan_left_to_right_32s_6k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            #pragma omp master
            {
                d = libsais64_partial_sorting_scan_left_to_right_32s_6k_block_sort(T, buckets, d, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            {
                libsais64_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }

    return d;
}

/*
 * 32s_4k counterpart of the blocked left-to-right scan driver: same thread partitioning and the
 * same gather / master-only sort / place phases, using the 4k helpers and finishing with
 * libsais64_compact_and_place_cached_suffixes. Returns the updated d.
 */
static sa_sint_t libsais64_partial_sorting_scan_left_to_right_32s_4k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t
omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Per-thread share rounded down to a multiple of 16; the last thread also takes the remainder. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            d = libsais64_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                /* cache - block_start lets the helpers index the cache with absolute SA positions. */
                libsais64_partial_sorting_scan_left_to_right_32s_4k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* Only the master thread runs the serial pass, over the whole block. */
            #pragma omp master
            {
                d = libsais64_partial_sorting_scan_left_to_right_32s_4k_block_sort(T, k, buckets, d, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            {
                libsais64_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }

    return d;
}

/*
 * 32s_1k counterpart of the blocked left-to-right scan driver: same thread partitioning and the
 * same gather / master-only sort / place phases, using the 1k helpers and finishing with
 * libsais64_compact_and_place_cached_suffixes. This variant carries no d counter.
 */
static void libsais64_partial_sorting_scan_left_to_right_32s_1k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Per-thread share rounded down to a multiple of 16; the last thread also takes the remainder. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            libsais64_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                /* cache - block_start lets the helpers index the cache with absolute SA positions. */
                libsais64_partial_sorting_scan_left_to_right_32s_1k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* Only the master thread runs the serial pass, over the whole block. */
            #pragma omp master
            {
                libsais64_partial_sorting_scan_left_to_right_32s_1k_block_sort(T, buckets, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            {
                libsais64_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }
}

#endif

/*
 * Entry point of the 32s_6k left-to-right partial-sorting scan.
 * First places position n - 1 (tagged with SAINT_MIN) at the head of its BUCKETS_INDEX4 bucket
 * and records ++d as that bucket's entry in buckets[2 + ...]. It then scans
 * SA[0, left_suffixes_count): sequentially when threads == 1 or fewer than 65536 entries are to
 * be scanned, otherwise (OpenMP builds only) in consecutive blocks of at most
 * threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries, all sharing thread_state[0].state.cache.
 * Returns the updated d.
 */
static sa_sint_t libsais64_partial_sorting_scan_left_to_right_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    SA[buckets[BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])]++] = (n - 1) | SAINT_MIN;
    buckets[2 + BUCKETS_INDEX4(T[n - 1], T[n - 2] >= T[n - 1])] = ++d;

    if (threads == 1 || left_suffixes_count < 65536)
    {
        d = libsais64_partial_sorting_scan_left_to_right_32s_6k(T, SA, buckets, d, 0, left_suffixes_count);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start, block_end;
        for (block_start = 0; block_start < left_suffixes_count; block_start = block_end)
        {
            /* Block length is capped at threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries. */
            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > left_suffixes_count) { block_end = left_suffixes_count; }

            d = libsais64_partial_sorting_scan_left_to_right_32s_6k_block_omp(T, SA, buckets, d, thread_state[0].state.cache, block_start, block_end - block_start, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif

    return d;
}

static sa_sint_t libsais64_partial_sorting_scan_left_to_right_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d,
sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    /*
     * Entry point of the 32s_4k left-to-right partial-sorting scan.
     * buckets is split into distinct_names = &buckets[0] and induction_bucket = &buckets[2 * k].
     * Position n - 1 is placed first (flagged when T[n - 2] < T[n - 1] and tagged with
     * SUFFIX_GROUP_MARKER) and its distinct_names entry is set to ++d. SA[0, n) is then scanned
     * sequentially when threads == 1 or n < 65536, otherwise (OpenMP builds only) in blocks of at
     * most threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries. Returns the updated d.
     */
    sa_sint_t * RESTRICT induction_bucket = &buckets[2 * (fast_sint_t)k];
    sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k];

    SA[induction_bucket[T[n - 1]]++] = (n - 1) | (sa_sint_t)((sa_uint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1)) | SUFFIX_GROUP_MARKER;
    distinct_names[BUCKETS_INDEX2(T[n - 1], T[n - 2] < T[n - 1])] = ++d;

    if (threads == 1 || n < 65536)
    {
        d = libsais64_partial_sorting_scan_left_to_right_32s_4k(T, SA, k, buckets, d, 0, n);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start, block_end;
        for (block_start = 0; block_start < n; block_start = block_end)
        {
            /* Block length is capped at threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries. */
            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; }

            d = libsais64_partial_sorting_scan_left_to_right_32s_4k_block_omp(T, SA, k, buckets, d, thread_state[0].state.cache, block_start, block_end - block_start, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif

    return d;
}

/*
 * Entry point of the 32s_1k left-to-right partial-sorting scan.
 * Position n - 1 is placed first at SA[buckets[T[n - 1]]++], flagged when T[n - 2] < T[n - 1].
 * SA[0, n) is then scanned sequentially when threads == 1 or n < 65536, otherwise (OpenMP builds
 * only) in blocks of at most threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries that share
 * thread_state[0].state.cache.
 */
static void libsais64_partial_sorting_scan_left_to_right_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    SA[buckets[T[n - 1]]++] = (n - 1) | (sa_sint_t)((sa_uint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));

    if (threads == 1 || n < 65536)
    {
        libsais64_partial_sorting_scan_left_to_right_32s_1k(T, SA, buckets, 0, n);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start, block_end;
        for (block_start = 0; block_start < n; block_start = block_end)
        {
            /* Block length is capped at threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries. */
            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; }

            libsais64_partial_sorting_scan_left_to_right_32s_1k_block_omp(T, SA, buckets, thread_state[0].state.cache, block_start, block_end - block_start, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif
}

static void libsais64_partial_sorting_shift_markers_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, const
sa_sint_t * RESTRICT buckets, sa_sint_t threads)
{
    /*
     * For every c from BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0) down to BUCKETS_INDEX2(1, 0), walks
     * SA from index temp_bucket[c] - 1 down to buckets[c - BUCKETS_INDEX2(1, 0)] and moves the
     * SAINT_MIN marker bit by one entry: each visited entry receives the marker the previously
     * visited entry (one index higher) had, and the first visited entry receives a set marker.
     * In OpenMP builds the ranges are distributed over threads when threads > 1 and n >= 65536.
     */
    const fast_sint_t prefetch_distance = 64;

    const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * ALPHABET_SIZE];

    fast_sint_t c;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && n >= 65536)
#else
    UNUSED(threads); UNUSED(n);
#endif
    for (c = BUCKETS_INDEX2(ALPHABET_SIZE - 1, 0); c >= BUCKETS_INDEX2(1, 0); c -= BUCKETS_INDEX2(1, 0))
    {
        /* s carries the marker to hand to the next entry; q is the XOR mask that swaps it with the entry's own marker. */
        fast_sint_t i, j; sa_sint_t s = SAINT_MIN;
        for (i = (fast_sint_t)temp_bucket[c] - 1, j = (fast_sint_t)buckets[c - BUCKETS_INDEX2(1, 0)] + 3; i >= j; i -= 4)
        {
            libsais64_prefetchw(&SA[i - prefetch_distance]);

            sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s; s = s ^ q0; SA[i - 0] = p0 ^ q0;
            sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1;
            sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2;
            sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3;
        }

        /* Remaining (fewer than four) entries of the range. */
        for (j -= 3; i >= j; i -= 1)
        {
            sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q;
        }
    }
}

/*
 * 32s_6k counterpart of the marker shift: for every c from k - 1 down to 1, walks SA from index
 * buckets[BUCKETS_INDEX4(c, 0)] - 1 down to temp_bucket[BUCKETS_INDEX2(c - 1, 0)] (with
 * temp_bucket = &buckets[4 * k]) and moves the SAINT_MIN marker bit by one entry in the same way.
 * In OpenMP builds the ranges are distributed over threads when threads > 1 and k >= 65536.
 */
static void libsais64_partial_sorting_shift_markers_32s_6k_omp(sa_sint_t * RESTRICT SA, sa_sint_t k, const sa_sint_t * RESTRICT buckets, sa_sint_t threads)
{
    const fast_sint_t prefetch_distance = 64;

    const sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k];

    fast_sint_t c;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && k >= 65536)
#else
    UNUSED(threads);
#endif
    for (c = (fast_sint_t)k - 1; c >= 1; c -= 1)
    {
        fast_sint_t i, j; sa_sint_t s = SAINT_MIN;
        for (i = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 0)] - 1, j = (fast_sint_t)temp_bucket[BUCKETS_INDEX2(c - 1, 0)] + 3; i >= j; i -= 4)
        {
            libsais64_prefetchw(&SA[i - prefetch_distance]);

            sa_sint_t p0 = SA[i - 0], q0 = (p0 & SAINT_MIN) ^ s; s = s ^ q0; SA[i - 0] = p0 ^ q0;
            sa_sint_t p1 = SA[i - 1], q1 = (p1 & SAINT_MIN) ^ s; s = s ^ q1; SA[i - 1] = p1 ^ q1;
sa_sint_t p2 = SA[i - 2], q2 = (p2 & SAINT_MIN) ^ s; s = s ^ q2; SA[i - 2] = p2 ^ q2;
            sa_sint_t p3 = SA[i - 3], q3 = (p3 & SAINT_MIN) ^ s; s = s ^ q3; SA[i - 3] = p3 ^ q3;
        }

        /* Remaining (fewer than four) entries of the range. */
        for (j -= 3; i >= j; i -= 1)
        {
            sa_sint_t p = SA[i], q = (p & SAINT_MIN) ^ s; s = s ^ q; SA[i] = p ^ q;
        }
    }
}

/*
 * Walks SA[n - 1] down to SA[0] and moves the SUFFIX_GROUP_MARKER bit by one positive entry:
 * each positive entry receives the marker the previously visited positive entry had, the first
 * one receives a set marker. Entries that are not positive are left unchanged and do not update
 * the carried marker, because the mask (p > 0) << (SUFFIX_GROUP_BIT - 1) is then zero.
 * NOTE(review): this reading assumes SUFFIX_GROUP_MARKER == 1 << (SUFFIX_GROUP_BIT - 1); the
 * definitions are not visible in this part of the file.
 */
static void libsais64_partial_sorting_shift_markers_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i; sa_sint_t s = SUFFIX_GROUP_MARKER;
    for (i = (fast_sint_t)n - 1; i >= 3; i -= 4)
    {
        libsais64_prefetchw(&SA[i - prefetch_distance]);

        sa_sint_t p0 = SA[i - 0], q0 = ((p0 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p0 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q0; SA[i - 0] = p0 ^ q0;
        sa_sint_t p1 = SA[i - 1], q1 = ((p1 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p1 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q1; SA[i - 1] = p1 ^ q1;
        sa_sint_t p2 = SA[i - 2], q2 = ((p2 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p2 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q2; SA[i - 2] = p2 ^ q2;
        sa_sint_t p3 = SA[i - 3], q3 = ((p3 & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p3 > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q3; SA[i - 3] = p3 ^ q3;
    }

    /* Remaining (fewer than four) entries at the low end of SA. */
    for (; i >= 0; i -= 1)
    {
        sa_sint_t p = SA[i], q = ((p & SUFFIX_GROUP_MARKER) ^ s) & ((sa_sint_t)(p > 0) << ((SUFFIX_GROUP_BIT - 1))); s = s ^ q; SA[i] = p ^ q;
    }
}

/*
 * Copies, for each i stepping by BUCKETS_INDEX2(1, 0) from BUCKETS_INDEX2(0, 0) through
 * BUCKETS_INDEX2(k - 1, 0), the two values temp_bucket[i + BUCKETS_INDEX2(0, 0)] and
 * temp_bucket[i + BUCKETS_INDEX2(0, 1)] (temp_bucket = &buckets[4 * k]) into
 * buckets[2 * i + BUCKETS_INDEX4(0, 0)] and buckets[2 * i + BUCKETS_INDEX4(0, 1)].
 */
static void libsais64_partial_sorting_shift_buckets_32s_6k(sa_sint_t k, sa_sint_t * RESTRICT buckets)
{
    sa_sint_t * RESTRICT temp_bucket = &buckets[4 * (fast_sint_t)k];

    fast_sint_t i;
    for (i = BUCKETS_INDEX2(0, 0); i <= BUCKETS_INDEX2((fast_sint_t)k - 1, 0); i += BUCKETS_INDEX2(1, 0))
    {
        buckets[2 * i + BUCKETS_INDEX4(0, 0)] = temp_bucket[i + BUCKETS_INDEX2(0, 0)];
        buckets[2 * i + BUCKETS_INDEX4(0, 1)] = temp_bucket[i + BUCKETS_INDEX2(0, 1)];
    }
}

static sa_sint_t libsais64_partial_sorting_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t
omp_block_size)
{
    /*
     * Sequential right-to-left pass over SA[omp_block_start, omp_block_start + omp_block_size),
     * from the highest index down. buckets is split into induction_bucket = &buckets[0] and
     * distinct_names = &buckets[2 * ALPHABET_SIZE].
     * For each entry: d is incremented when the raw value is negative; with p = value & SAINT_MAX
     * and v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]), p - 1 is written to
     * SA[--induction_bucket[v]] with bit (SAINT_BIT - 1) set when distinct_names[v] != d, and
     * distinct_names[v] is then set to d. Returns the updated d.
     */
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

    fast_sint_t i, j;
    /* Two entries per iteration; the bound keeps the look-ahead read SA[i - prefetch_distance - 1] inside the block. */
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais64_prefetchr(&SA[i - 2 * prefetch_distance]);

        libsais64_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1);
        libsais64_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2);
        libsais64_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1);
        libsais64_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2);

        sa_sint_t p0 = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
        SA[--induction_bucket[v0]] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d;

        sa_sint_t p1 = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
        SA[--induction_bucket[v1]] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d;
    }

    /* Tail: the lowest entries of the block, one at a time and without look-ahead. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
        SA[--induction_bucket[v]] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
    }

    return d;
}

/*
 * Same right-to-left pass as libsais64_partial_sorting_scan_right_to_left_8u, except that an
 * entry whose bucket index v equals 1 is neither written to SA nor recorded in distinct_names
 * (d is still updated for it). Returns the updated d.
 * NOTE(review): v == 1 presumably corresponds to BUCKETS_INDEX2(0, 1); the macro is not visible
 * in this part of the file.
 */
static sa_sint_t libsais64_partial_gsa_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j;
i -= 2)
    {
        libsais64_prefetchr(&SA[i - 2 * prefetch_distance]);

        libsais64_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1);
        libsais64_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2);
        libsais64_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1);
        libsais64_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2);

        /* d is updated for every entry; only entries with v != 1 are written out. */
        sa_sint_t p0 = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
        if (v0 != 1) { SA[--induction_bucket[v0]] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; }

        sa_sint_t p1 = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
        if (v1 != 1) { SA[--induction_bucket[v1]] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d; }
    }

    /* Tail: the lowest entries of the block, one at a time and without look-ahead. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
        if (v != 1) { SA[--induction_bucket[v]] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; }
    }

    return d;
}

#if defined(LIBSAIS_OPENMP)

/*
 * Per-thread preparation step of the blocked 8u right-to-left scan.
 * Zeroes the first 2 * k entries of both halves of the given buckets (induction_bucket at
 * &buckets[0], distinct_names at &buckets[2 * ALPHABET_SIZE]), then walks the block from its
 * highest index down. Each entry is appended to cache[] (raw SA value in index, BUCKETS_INDEX2
 * value in symbol), counted in induction_bucket[v], and distinct_names[v] is set to a local
 * counter d that starts at 1 and is incremented for every negative SA value.
 * On exit state[0].state.position holds d - 1 and state[0].state.count the number of cached
 * entries. SA is only read.
 */
static void libsais64_partial_sorting_scan_right_to_left_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size, LIBSAIS_THREAD_STATE * RESTRICT state)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

    memset(induction_bucket, 0, (size_t)2 * (size_t)k * sizeof(sa_sint_t));
    memset(distinct_names , 0, (size_t)2 * (size_t)k * sizeof(sa_sint_t));

    fast_sint_t i, j, count = 0; sa_sint_t d = 1;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start +
prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais64_prefetchr(&SA[i - 2 * prefetch_distance]);

        libsais64_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 1);
        libsais64_prefetchr(&T[SA[i - prefetch_distance - 0] & SAINT_MAX] - 2);
        libsais64_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 1);
        libsais64_prefetchr(&T[SA[i - prefetch_distance - 1] & SAINT_MAX] - 2);

        /* Cache entries are appended in scan order (highest SA index first). */
        sa_sint_t p0 = cache[count].index = SA[i - 0]; d += (p0 < 0); p0 &= SAINT_MAX; sa_sint_t v0 = cache[count++].symbol = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); induction_bucket[v0]++; distinct_names[v0] = d;
        sa_sint_t p1 = cache[count].index = SA[i - 1]; d += (p1 < 0); p1 &= SAINT_MAX; sa_sint_t v1 = cache[count++].symbol = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); induction_bucket[v1]++; distinct_names[v1] = d;
    }

    /* Tail: the lowest entries of the block, one at a time and without look-ahead. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = cache[count].index = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = cache[count++].symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); induction_bucket[v]++; distinct_names[v] = d;
    }

    /* d started at 1, so d - 1 is the number of negative entries seen in this block. */
    state[0].state.position = (fast_sint_t)d - 1;
    state[0].state.count = count;
}

/*
 * Per-thread placement step of the blocked 8u right-to-left scan.
 * Replays the first count cache entries in order, starting from the given d: d is incremented
 * for every negative cached index, then p - 1 is written to SA[--induction_bucket[v]] with bit
 * (SAINT_BIT - 1) set when distinct_names[v] != d, and distinct_names[v] is set to d.
 * buckets is split into induction_bucket = &buckets[0] and
 * distinct_names = &buckets[2 * ALPHABET_SIZE].
 */
static void libsais64_partial_sorting_scan_right_to_left_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
    sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

    fast_sint_t i, j;
    for (i = 0, j = count - 1; i < j; i += 2)
    {
        libsais64_prefetchr(&cache[i + prefetch_distance]);

        sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol;
        SA[--induction_bucket[v0]] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d;

        sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol;
        SA[--induction_bucket[v1]] = (p1 - 1) |
(sa_sint_t)((sa_uint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d; } for (j += 1; i < j; i += 1) { sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol; SA[--induction_bucket[v]] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; } } static void libsais64_partial_gsa_scan_right_to_left_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count, sa_sint_t d) { const fast_sint_t prefetch_distance = 64; sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; fast_sint_t i, j; for (i = 0, j = count - 1; i < j; i += 2) { libsais64_prefetchr(&cache[i + prefetch_distance]); sa_sint_t p0 = cache[i + 0].index; d += (p0 < 0); sa_sint_t v0 = cache[i + 0].symbol; if (v0 != 1) { SA[--induction_bucket[v0]] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v0] != d) << (SAINT_BIT - 1)); distinct_names[v0] = d; } sa_sint_t p1 = cache[i + 1].index; d += (p1 < 0); sa_sint_t v1 = cache[i + 1].symbol; if (v1 != 1) { SA[--induction_bucket[v1]] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v1] != d) << (SAINT_BIT - 1)); distinct_names[v1] = d; } } for (j += 1; i < j; i += 1) { sa_sint_t p = cache[i].index; d += (p < 0); sa_sint_t v = cache[i].symbol; if (v != 1) { SA[--induction_bucket[v]] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; } } } static sa_sint_t libsais64_partial_sorting_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ? 
k : 256) && omp_get_dynamic() == 0) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(k); UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { d = libsais64_partial_sorting_scan_right_to_left_8u(T, SA, buckets, d, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { libsais64_partial_sorting_scan_right_to_left_8u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]); } #pragma omp barrier #pragma omp master { sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE]; sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE]; fast_sint_t t; for (t = omp_num_threads - 1; t >= 0; --t) { sa_sint_t * RESTRICT temp_induction_bucket = &thread_state[t].state.buckets[0 * ALPHABET_SIZE]; sa_sint_t * RESTRICT temp_distinct_names = &thread_state[t].state.buckets[2 * ALPHABET_SIZE]; fast_sint_t c; for (c = 0; c < 2 * k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A - B; temp_induction_bucket[c] = A; } for (d -= 1, c = 0; c < 2 * k; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ? 
D : A; temp_distinct_names[c] = A; }

                    /* Net effect: d grows by the thread's position value, and position is replaced by the d value from before this thread was merged. */
                    d += 1 + (sa_sint_t)thread_state[t].state.position; thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position;
                }
            }

            /* Phase 3: every thread writes its cached entries into SA, starting from its own rebased d. */
            #pragma omp barrier

            {
                libsais64_partial_sorting_scan_right_to_left_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position);
            }
        }
#endif
    }

    return d;
}

/*
 * GSA counterpart of the 8u block driver: same structure, but the
 * single-thread path calls libsais64_partial_gsa_scan_right_to_left_8u and
 * the final phase calls the gsa _block_place step. The prepare step is the
 * same function the non-GSA driver uses.
 *
 * Returns the updated d. The OpenMP parallel region is opened only when
 * threads > 1, block_size >= 64 * max(k, 256) and omp_get_dynamic() == 0.
 */
static sa_sint_t libsais64_partial_gsa_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ? k : 256) && omp_get_dynamic() == 0)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(k); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Per-thread slice: the stride is rounded down to a multiple of 16 and the last thread also takes the remainder. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            /* Single thread: run the sequential GSA scan over the whole block. */
            d = libsais64_partial_gsa_scan_right_to_left_8u(T, SA, buckets, d, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /* Phase 1: every thread fills its own buckets and cache for its slice (shared with the non-GSA driver). */
            {
                libsais64_partial_sorting_scan_right_to_left_8u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size, &thread_state[omp_thread_num]);
            }

            /* Phase 2 (master only): merge per-thread state into the shared buckets, visiting threads from last to first. */
            #pragma omp barrier

            #pragma omp master
            {
                sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
                sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

                fast_sint_t t;
                for (t = omp_num_threads - 1; t >= 0; --t)
                {
                    sa_sint_t * RESTRICT temp_induction_bucket = &thread_state[t].state.buckets[0 * ALPHABET_SIZE];
                    sa_sint_t * RESTRICT temp_distinct_names = &thread_state[t].state.buckets[2 * ALPHABET_SIZE];

                    /* The shared bucket drops by the thread's value B (presumably its per-bucket count from the prepare step); the thread receives the pre-decrement value A. */
                    fast_sint_t c;
                    for (c = 0; c < 2 * k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_induction_bucket[c]; induction_bucket[c] = A - B; temp_induction_bucket[c] = A; }

                    /* A thread-local name B > 0 is rebased by (d - 1); a bucket with B <= 0 keeps the shared value A. */
                    for (d -= 1, c = 0; c < 2 * k; c += 1) { sa_sint_t A = distinct_names[c], B = temp_distinct_names[c], D = B + d; distinct_names[c] = B > 0 ?
D : A; temp_distinct_names[c] = A; }

                    /* Net effect: d grows by the thread's position value, and position is replaced by the d value from before this thread was merged. */
                    d += 1 + (sa_sint_t)thread_state[t].state.position; thread_state[t].state.position = (fast_sint_t)d - thread_state[t].state.position;
                }
            }

            /* Phase 3: every thread writes its cached entries into SA, starting from its own rebased d. */
            #pragma omp barrier

            {
                libsais64_partial_gsa_scan_right_to_left_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count, (sa_sint_t)thread_state[omp_thread_num].state.position);
            }
        }
#endif
    }

    return d;
}

#endif

/*
 * Top-level driver of the right-to-left partial sorting scan for 8-bit text.
 *
 * The scanned range is SA[scan_start, scan_end) with
 * scan_start = left_suffixes_count + 1 and scan_end = n - first_lms_suffix.
 * When threads == 1 or the range is shorter than 65536 entries the
 * sequential scan is used. Otherwise (OpenMP builds) the range is walked
 * from its high end downward: zero entries are skipped, and each run of
 * consecutive non-zero entries is cut into blocks of at most
 * threads * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads) entries. Blocks
 * shorter than 32 are processed inline, larger ones go to the _block_omp
 * driver.
 *
 * d is taken by value and the final value is not returned.
 * NOTE(review): without LIBSAIS_OPENMP there is no else branch, so callers
 * are presumably expected to pass threads == 1 in such builds - confirm.
 */
static void libsais64_partial_sorting_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
    fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;

    if (threads == 1 || (scan_end - scan_start) < 65536)
    {
        libsais64_partial_sorting_scan_right_to_left_8u(T, SA, buckets, d, scan_start, scan_end - scan_start);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
        sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

        fast_sint_t block_start;
        for (block_start = scan_end - 1; block_start >= scan_start; )
        {
            if (SA[block_start] == 0)
            {
                /* Zero entries are skipped. */
                block_start--;
            }
            else
            {
                /* Lowest index the block may reach, clamped to just below scan_start. */
                fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < scan_start) { block_max_end = scan_start - 1; }
                /* Extend the block downward until a zero entry or the cap; block_end is exclusive. */
                fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
                fast_sint_t block_size = block_start - block_end;

                if (block_size < 32)
                {
                    /* Short block: induce inline, one entry at a time. */
                    for (; block_start > block_end; block_start -= 1)
                    {
                        sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
                        SA[--induction_bucket[v]] = (p - 1) |
(sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d;
                    }
                }
                else
                {
                    /* Larger block: hand SA[block_end + 1, block_start] to the block driver. */
                    d = libsais64_partial_sorting_scan_right_to_left_8u_block_omp(T, SA, k, buckets, d, block_end + 1, block_size, threads, thread_state);
                    block_start = block_end;
                }
            }
        }
    }
#else
    UNUSED(k); UNUSED(thread_state);
#endif
}

/*
 * GSA counterpart of the 8u right-to-left driver. The structure matches the
 * non-GSA driver: the range is SA[left_suffixes_count + 1, n - first_lms_suffix),
 * the sequential GSA scan is used when threads == 1 or the range is shorter
 * than 65536 entries, and otherwise (OpenMP builds) runs of non-zero entries
 * are cut into blocks of at most
 * threads * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads) entries.
 *
 * The inline path for blocks shorter than 32 entries does not store entries
 * whose bucket index v equals 1; d is still updated for them.
 *
 * d is taken by value and the final value is not returned.
 * NOTE(review): without LIBSAIS_OPENMP there is no else branch, so callers
 * are presumably expected to pass threads == 1 in such builds - confirm.
 */
static void libsais64_partial_gsa_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
    fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;

    if (threads == 1 || (scan_end - scan_start) < 65536)
    {
        libsais64_partial_gsa_scan_right_to_left_8u(T, SA, buckets, d, scan_start, scan_end - scan_start);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        sa_sint_t * RESTRICT induction_bucket = &buckets[0 * ALPHABET_SIZE];
        sa_sint_t * RESTRICT distinct_names = &buckets[2 * ALPHABET_SIZE];

        fast_sint_t block_start;
        for (block_start = scan_end - 1; block_start >= scan_start; )
        {
            if (SA[block_start] == 0)
            {
                /* Zero entries are skipped. */
                block_start--;
            }
            else
            {
                /* Lowest index the block may reach, clamped to just below scan_start. */
                fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < scan_start) { block_max_end = scan_start - 1; }
                /* Extend the block downward until a zero entry or the cap; block_end is exclusive. */
                fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
                fast_sint_t block_size = block_start - block_end;

                if (block_size < 32)
                {
                    /* Short block: induce inline, one entry at a time. */
                    for (; block_start > block_end; block_start -= 1)
                    {
                        sa_sint_t p = SA[block_start]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
                        if (v != 1) { SA[--induction_bucket[v]] = (p - 1) | (sa_sint_t)((sa_uint_t)(distinct_names[v] != d) << (SAINT_BIT - 1)); distinct_names[v] = d; }
                    }
                }
                else
                {
                    /* Larger block: hand SA[block_end + 1, block_start] to the GSA block driver. */
                    d =
libsais64_partial_gsa_scan_right_to_left_8u_block_omp(T, SA, k, buckets, d, block_end + 1, block_size, threads, thread_state);
                    block_start = block_end;
                }
            }
        }
    }
#else
    UNUSED(k); UNUSED(thread_state);
#endif
}

/*
 * Sequential right-to-left partial sorting scan for integer text, using a
 * single interleaved `buckets` array indexed through BUCKETS_INDEX4.
 *
 * Scans SA[omp_block_start, omp_block_start + omp_block_size) from the high
 * end down. For each entry p: d is incremented when p is negative, the top
 * bit is cleared, v = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]) is
 * computed, and (p - 1) is stored at SA[--buckets[v]] with bit
 * (SAINT_BIT - 1) set when buckets[2 + v] differs from d; buckets[2 + v] is
 * then set to d.
 *
 * Every entry in the range is processed unconditionally. Returns the
 * updated d.
 */
static sa_sint_t libsais64_partial_sorting_scan_right_to_left_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Unrolled by two; stops early enough that every look-ahead read of SA stays at or above omp_block_start. */
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais64_prefetchr(&SA[i - 3 * prefetch_distance]);

        /* Two prefetch distances ahead: prefetch the text around those suffixes. */
        libsais64_prefetchr(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 1);
        libsais64_prefetchr(&T[SA[i - 2 * prefetch_distance - 0] & SAINT_MAX] - 2);
        libsais64_prefetchr(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 1);
        libsais64_prefetchr(&T[SA[i - 2 * prefetch_distance - 1] & SAINT_MAX] - 2);

        /* One prefetch distance ahead: prefetch the bucket slot; (p > 0) keeps the text index non-negative when p is 0. */
        sa_sint_t p0 = SA[i - prefetch_distance - 0] & SAINT_MAX; sa_sint_t v0 = BUCKETS_INDEX4(T[p0 - (p0 > 0)], 0); libsais64_prefetchw(&buckets[v0]);
        sa_sint_t p1 = SA[i - prefetch_distance - 1] & SAINT_MAX; sa_sint_t v1 = BUCKETS_INDEX4(T[p1 - (p1 > 0)], 0); libsais64_prefetchw(&buckets[v1]);

        sa_sint_t p2 = SA[i - 0]; d += (p2 < 0); p2 &= SAINT_MAX; sa_sint_t v2 = BUCKETS_INDEX4(T[p2 - 1], T[p2 - 2] > T[p2 - 1]);
        SA[--buckets[v2]] = (p2 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v2] != d) << (SAINT_BIT - 1)); buckets[2 + v2] = d;

        sa_sint_t p3 = SA[i - 1]; d += (p3 < 0); p3 &= SAINT_MAX; sa_sint_t v3 = BUCKETS_INDEX4(T[p3 - 1], T[p3 - 2] > T[p3 - 1]);
        SA[--buckets[v3]] = (p3 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v3] != d) << (SAINT_BIT - 1)); buckets[2 + v3] = d;
    }

    /* Remaining entries down to omp_block_start, without look-ahead. */
    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; d += (p < 0); p &= SAINT_MAX; sa_sint_t v = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]);
        SA[--buckets[v]] = (p - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v] != d) << (SAINT_BIT -
1)); buckets[2 + v] = d;
    }

    return d;
}

/*
 * Sequential right-to-left partial sorting scan for integer text with
 * separate bucket regions: induction_bucket is buckets[3 * k ...] and
 * distinct_names is buckets[0 ...].
 *
 * Scans SA[omp_block_start, omp_block_start + omp_block_size) from the high
 * end down. Only entries p > 0 are processed: the slot is zeroed,
 * p >> (SUFFIX_GROUP_BIT - 1) is added to d, SUFFIX_GROUP_MARKER is
 * cleared, and (p - 1) is stored at SA[--induction_bucket[T[p - 1]]] with
 * bit (SAINT_BIT - 1) set when T[p - 2] > T[p - 1] and bit
 * (SUFFIX_GROUP_BIT - 1) set when distinct_names[v] differs from d, where
 * v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]). distinct_names[v] is
 * then set to d.
 *
 * Returns the updated d.
 */
static sa_sint_t libsais64_partial_sorting_scan_right_to_left_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[3 * (fast_sint_t)k];
    sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k];

    fast_sint_t i, j;
    /* Unrolled by two; stops early enough that every look-ahead read of SA stays at or above omp_block_start. */
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais64_prefetchw(&SA[i - 3 * prefetch_distance]);

        /* Two distances ahead: prefetch text; non-positive entries fall back to index 2 so Ts - 2 stays inside T. */
        sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 & ~SUFFIX_GROUP_MARKER : 2]; libsais64_prefetchr(Ts0 - 1); libsais64_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ? s1 & ~SUFFIX_GROUP_MARKER : 2]; libsais64_prefetchr(Ts1 - 1); libsais64_prefetchr(Ts1 - 2);
        /* One distance ahead: prefetch the bucket slots that entry will touch. */
        sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { const fast_sint_t Ts2 = T[(s2 & ~SUFFIX_GROUP_MARKER) - 1]; libsais64_prefetchw(&induction_bucket[Ts2]); libsais64_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts2, 0)]); }
        sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { const fast_sint_t Ts3 = T[(s3 & ~SUFFIX_GROUP_MARKER) - 1]; libsais64_prefetchw(&induction_bucket[Ts3]); libsais64_prefetchw(&distinct_names[BUCKETS_INDEX2(Ts3, 0)]); }

        sa_sint_t p0 = SA[i - 0];
        if (p0 > 0)
        {
            SA[i - 0] = 0; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); p0 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]);
            SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d;
        }

        sa_sint_t p1 = SA[i - 1];
        if (p1 > 0)
        {
            SA[i - 1] = 0; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); p1 &= ~SUFFIX_GROUP_MARKER; sa_sint_t v1 =
BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]);
            SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
        }
    }

    /* Remaining entries down to omp_block_start, without look-ahead. */
    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i];
        if (p > 0)
        {
            SA[i] = 0; d += (p >> (SUFFIX_GROUP_BIT - 1)); p &= ~SUFFIX_GROUP_MARKER; sa_sint_t v = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]);
            SA[--induction_bucket[T[p - 1]]] = (p - 1) | (sa_sint_t)((sa_uint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
        }
    }

    return d;
}

/*
 * Sequential right-to-left partial sorting scan for integer text with a
 * single induction bucket array indexed by symbol; no d counter is kept.
 *
 * Scans SA[omp_block_start, omp_block_start + omp_block_size) from the high
 * end down. Each entry p > 0 is zeroed in place and (p - 1) is stored at
 * SA[--induction_bucket[T[p - 1]]], with bit (SAINT_BIT - 1) set when
 * T[p - 2] > T[p - 1].
 */
static void libsais64_partial_sorting_scan_right_to_left_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Unrolled by two; stops early enough that every look-ahead read of SA stays at or above omp_block_start. */
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais64_prefetchw(&SA[i - 3 * prefetch_distance]);

        /* Two distances ahead: prefetch text; non-positive entries fall back to index 1 so Ts - 1 stays inside T. */
        sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 1]; libsais64_prefetchr(Ts0 - 1);
        sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 : 1]; libsais64_prefetchr(Ts1 - 1);
        /* One distance ahead: prefetch the destination bucket and the second text symbol. */
        sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais64_prefetchw(&induction_bucket[T[s2 - 1]]); libsais64_prefetchr(&T[s2] - 2); }
        sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais64_prefetchw(&induction_bucket[T[s3 - 1]]); libsais64_prefetchr(&T[s3] - 2); }

        sa_sint_t p0 = SA[i - 0]; if (p0 > 0) { SA[i - 0] = 0; SA[--induction_bucket[T[p0 - 1]]] = (p0 - 1) | (sa_sint_t)((sa_uint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)); }
        sa_sint_t p1 = SA[i - 1]; if (p1 > 0) { SA[i - 1] = 0; SA[--induction_bucket[T[p1 - 1]]] = (p1 - 1) | (sa_sint_t)((sa_uint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)); }
    }

    /* Remaining entries down to omp_block_start, without look-ahead. */
    for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; if (p > 0) { SA[i] = 0; SA[--induction_bucket[T[p - 1]]] = (p - 1) | (sa_sint_t)((sa_uint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)); }
    }
}

#if defined(LIBSAIS_OPENMP)

/*
 * Gather step of the parallel 6k scan: copies
 * SA[omp_block_start, omp_block_start + omp_block_size) into `cache`, using
 * the same index for both arrays.
 *
 * For each position, cache[i].index receives the raw SA value (top bit
 * included). cache[i].symbol receives
 * BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]) for the value p with the
 * top bit cleared, or 0 when p is 0. SA itself is not modified.
 */
static void libsais64_partial_sorting_scan_right_to_left_32s_6k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Unrolled by two; stops early enough that SA reads one prefetch distance ahead stay inside the slice. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais64_prefetchr(&SA[i + 2 * prefetch_distance]);

        libsais64_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 1);
        libsais64_prefetchr(&T[SA[i + prefetch_distance + 0] & SAINT_MAX] - 2);
        libsais64_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 1);
        libsais64_prefetchr(&T[SA[i + prefetch_distance + 1] & SAINT_MAX] - 2);

        libsais64_prefetchw(&cache[i + prefetch_distance]);

        sa_sint_t p0 = cache[i + 0].index = SA[i + 0]; sa_sint_t symbol0 = 0; p0 &= SAINT_MAX; if (p0 != 0) { symbol0 = BUCKETS_INDEX4(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); } cache[i + 0].symbol = symbol0;
        sa_sint_t p1 = cache[i + 1].index = SA[i + 1]; sa_sint_t symbol1 = 0; p1 &= SAINT_MAX;
if (p1 != 0) { symbol1 = BUCKETS_INDEX4(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); } cache[i + 1].symbol = symbol1;
    }

    /* Remaining entries up to the end of the slice, without look-ahead. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = cache[i].index = SA[i]; sa_sint_t symbol = 0; p &= SAINT_MAX; if (p != 0) { symbol = BUCKETS_INDEX4(T[p - 1], T[p - 2] > T[p - 1]); } cache[i].symbol = symbol;
    }
}

/*
 * Gather step of the parallel 4k scan: moves the positive entries of
 * SA[omp_block_start, omp_block_start + omp_block_size) into `cache`, using
 * the same index for both arrays.
 *
 * For an entry p > 0 the SA slot is zeroed, cache[i].index receives the raw
 * p (SUFFIX_GROUP_MARKER still included) and cache[i].symbol receives
 * BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]) computed with the marker
 * cleared. For any other entry cache[i].symbol is set to SAINT_MIN and
 * cache[i].index is left untouched.
 */
static void libsais64_partial_sorting_scan_right_to_left_32s_4k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Unrolled by two; stops early enough that SA reads one prefetch distance ahead stay inside the slice. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais64_prefetchw(&SA[i + 2 * prefetch_distance]);

        /* Non-positive look-ahead entries fall back to index 2 so Ts - 2 stays inside T. */
        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 & ~SUFFIX_GROUP_MARKER : 2]; libsais64_prefetchr(Ts0 - 1); libsais64_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 & ~SUFFIX_GROUP_MARKER : 2]; libsais64_prefetchr(Ts1 - 1); libsais64_prefetchr(Ts1 - 2);

        libsais64_prefetchw(&cache[i + prefetch_distance]);

        /* SAINT_MIN in .symbol marks a cache entry that holds no suffix. */
        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { SA[i + 0] = 0; cache[i + 0].index = p0; p0 &= ~SUFFIX_GROUP_MARKER; symbol0 = BUCKETS_INDEX2(T[p0 - 1], T[p0 - 2] > T[p0 - 1]); } cache[i + 0].symbol = symbol0;
        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { SA[i + 1] = 0; cache[i + 1].index = p1; p1 &= ~SUFFIX_GROUP_MARKER; symbol1 = BUCKETS_INDEX2(T[p1 - 1], T[p1 - 2] > T[p1 - 1]); } cache[i + 1].symbol = symbol1;
    }

    /* Remaining entries up to the end of the slice, without look-ahead. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { SA[i] = 0; cache[i].index = p; p &= ~SUFFIX_GROUP_MARKER; symbol = BUCKETS_INDEX2(T[p - 1], T[p - 2] > T[p - 1]); } cache[i].symbol = symbol;
    }
}

/*
 * Gather step of the parallel 1k scan: moves the positive entries of
 * SA[omp_block_start, omp_block_start + omp_block_size) into `cache`, using
 * the same index for both arrays.
 *
 * For an entry p > 0 the SA slot is zeroed, cache[i].index receives the
 * already-induced value (p - 1) with bit (SAINT_BIT - 1) set when
 * T[p - 2] > T[p - 1], and cache[i].symbol receives T[p - 1]. For any other
 * entry cache[i].symbol is set to SAINT_MIN and cache[i].index is left
 * untouched.
 */
static void libsais64_partial_sorting_scan_right_to_left_32s_1k_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Unrolled by two; stops early enough that SA reads one prefetch distance ahead stay inside the slice. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais64_prefetchw(&SA[i + 2 * prefetch_distance]);

        /* Non-positive look-ahead entries fall back to index 2 so Ts - 2 stays inside T. */
        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais64_prefetchr(Ts0 - 1); libsais64_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ?
s1 : 2]; libsais64_prefetchr(Ts1 - 1); libsais64_prefetchr(Ts1 - 2);

        libsais64_prefetchw(&cache[i + prefetch_distance]);

        /* SAINT_MIN in .symbol marks a cache entry that holds no suffix. */
        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; if (p0 > 0) { SA[i + 0] = 0; cache[i + 0].index = (p0 - 1) | (sa_sint_t)((sa_uint_t)(T[p0 - 2] > T[p0 - 1]) << (SAINT_BIT - 1)); symbol0 = T[p0 - 1]; } cache[i + 0].symbol = symbol0;
        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; if (p1 > 0) { SA[i + 1] = 0; cache[i + 1].index = (p1 - 1) | (sa_sint_t)((sa_uint_t)(T[p1 - 2] > T[p1 - 1]) << (SAINT_BIT - 1)); symbol1 = T[p1 - 1]; } cache[i + 1].symbol = symbol1;
    }

    /* Remaining entries up to the end of the slice, without look-ahead. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t symbol = SAINT_MIN, p = SA[i]; if (p > 0) { SA[i] = 0; cache[i].index = (p - 1) | (sa_sint_t)((sa_uint_t)(T[p - 2] > T[p - 1]) << (SAINT_BIT - 1)); symbol = T[p - 1]; } cache[i].symbol = symbol;
    }
}

/*
 * Sort step of the parallel 6k scan, run over the whole gathered block
 * cache[omp_block_start, omp_block_start + omp_block_size) from the high
 * end down.
 *
 * Each cache entry (symbol v, index p) is rewritten in place into a
 * placement record: .symbol becomes the destination position --buckets[v]
 * and .index becomes (p - 1) with bit (SAINT_BIT - 1) set when
 * buckets[2 + v] differs from d. d is incremented for negative p and
 * buckets[2 + v] is set to d.
 *
 * When the destination is at or above omp_block_start, the new value is
 * also written into the cache entry at that destination, with its bucket
 * symbol recomputed from T, so that a later iteration of this loop
 * processes it.
 *
 * Returns the updated d.
 */
static sa_sint_t libsais64_partial_sorting_scan_right_to_left_32s_6k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Unrolled by two; stops early enough that cache reads one prefetch distance ahead stay at or above omp_block_start. */
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais64_prefetchw(&cache[i - 2 * prefetch_distance]);

        libsais64_prefetchw(&buckets[cache[i - prefetch_distance - 0].symbol]);
        libsais64_prefetchw(&buckets[cache[i - prefetch_distance - 1].symbol]);

        sa_sint_t v0 = cache[i - 0].symbol, p0 = cache[i - 0].index; d += (p0 < 0); cache[i - 0].symbol = --buckets[v0]; cache[i - 0].index = (p0 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v0] != d) << (SAINT_BIT - 1)); buckets[2 + v0] = d;
        /* Destination at or above omp_block_start: forward the value into that cache entry. */
        if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t s = cache[i - 0].symbol, q = (cache[s].index = cache[i - 0].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); }

        sa_sint_t v1 = cache[i - 1].symbol, p1 = cache[i - 1].index; d += (p1 < 0); cache[i - 1].symbol =
--buckets[v1]; cache[i - 1].index = (p1 - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v1] != d) << (SAINT_BIT - 1)); buckets[2 + v1] = d;
        /* Destination at or above omp_block_start: forward the value into that cache entry. */
        if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t s = cache[i - 1].symbol, q = (cache[s].index = cache[i - 1].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); }
    }

    /* Remaining entries down to omp_block_start, without look-ahead. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t v = cache[i].symbol, p = cache[i].index; d += (p < 0); cache[i].symbol = --buckets[v]; cache[i].index = (p - 1) | (sa_sint_t)((sa_uint_t)(buckets[2 + v] != d) << (SAINT_BIT - 1)); buckets[2 + v] = d;
        if (cache[i].symbol >= omp_block_start) { sa_sint_t s = cache[i].symbol, q = (cache[s].index = cache[i].index) & SAINT_MAX; cache[s].symbol = BUCKETS_INDEX4(T[q - 1], T[q - 2] > T[q - 1]); }
    }

    return d;
}

/*
 * Sort step of the parallel 4k scan, run over the whole gathered block
 * cache[omp_block_start, omp_block_start + omp_block_size) from the high
 * end down. induction_bucket is buckets[3 * k ...] and distinct_names is
 * buckets[0 ...].
 *
 * Entries with a negative symbol are skipped. For an entry (symbol v,
 * index p): p >> (SUFFIX_GROUP_BIT - 1) is added to d, .symbol becomes the
 * destination --induction_bucket[v >> 1], and .index becomes (p - 1) with
 * v shifted into bit (SAINT_BIT - 1) and bit (SUFFIX_GROUP_BIT - 1) set
 * when distinct_names[v] differs from d; distinct_names[v] is set to d.
 *
 * When the destination is at or above omp_block_start and the new index is
 * positive, the value is moved into the cache entry at that destination
 * (the source .index is zeroed) and its symbol is recomputed from T.
 *
 * Returns the updated d.
 */
static sa_sint_t libsais64_partial_sorting_scan_right_to_left_32s_4k_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT induction_bucket = &buckets[3 * (fast_sint_t)k];
    sa_sint_t * RESTRICT distinct_names = &buckets[0 * (fast_sint_t)k];

    fast_sint_t i, j;
    /* Unrolled by two; stops early enough that cache reads one prefetch distance ahead stay at or above omp_block_start. */
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais64_prefetchw(&cache[i - 2 * prefetch_distance]);

        /* Non-positive look-ahead symbols fall back to slot 0 for the prefetch address. */
        sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 > 0 ? s0 >> 1 : 0]; libsais64_prefetchw(Is0); const sa_sint_t * Ds0 = &distinct_names[s0 > 0 ? s0 : 0]; libsais64_prefetchw(Ds0);
        sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 > 0 ? s1 >> 1 : 0]; libsais64_prefetchw(Is1); const sa_sint_t * Ds1 = &distinct_names[s1 > 0 ?
s1 : 0]; libsais64_prefetchw(Ds1);

        sa_sint_t v0 = cache[i - 0].symbol;
        if (v0 >= 0)
        {
            sa_sint_t p0 = cache[i - 0].index; d += (p0 >> (SUFFIX_GROUP_BIT - 1)); cache[i - 0].symbol = --induction_bucket[v0 >> 1]; cache[i - 0].index = (p0 - 1) | (sa_sint_t)((sa_uint_t)v0 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v0] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v0] = d;
            /* Destination at or above omp_block_start: move a positive value into that cache entry. */
            if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; if (np > 0) { cache[i - 0].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } }
        }

        sa_sint_t v1 = cache[i - 1].symbol;
        if (v1 >= 0)
        {
            sa_sint_t p1 = cache[i - 1].index; d += (p1 >> (SUFFIX_GROUP_BIT - 1)); cache[i - 1].symbol = --induction_bucket[v1 >> 1]; cache[i - 1].index = (p1 - 1) | (sa_sint_t)((sa_uint_t)v1 << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v1] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v1] = d;
            if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; if (np > 0) { cache[i - 1].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } }
        }
    }

    /* Remaining entries down to omp_block_start, without look-ahead. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t v = cache[i].symbol;
        if (v >= 0)
        {
            sa_sint_t p = cache[i].index; d += (p >> (SUFFIX_GROUP_BIT - 1)); cache[i].symbol = --induction_bucket[v >> 1]; cache[i].index = (p - 1) | (sa_sint_t)((sa_uint_t)v << (SAINT_BIT - 1)) | ((sa_sint_t)(distinct_names[v] != d) << (SUFFIX_GROUP_BIT - 1)); distinct_names[v] = d;
            if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[i].index = 0; cache[ni].index = np; np &= ~SUFFIX_GROUP_MARKER; cache[ni].symbol = BUCKETS_INDEX2(T[np - 1], T[np - 2] > T[np - 1]); } }
        }
    }

    return d;
}

/*
 * Sort step of the parallel 1k scan, run over the whole gathered block
 * cache[omp_block_start, omp_block_start + omp_block_size) from the high
 * end down.
 *
 * Entries with a negative symbol are skipped. For any other entry, .symbol
 * is replaced by the destination position --induction_bucket[symbol]. When
 * that destination is at or above omp_block_start and .index is positive,
 * the entry is induced again: the source .index is zeroed and the
 * destination cache entry receives (index - 1), flagged in bit
 * (SAINT_BIT - 1) when T[index - 2] > T[index - 1], with symbol T[index - 1].
 */
static void libsais64_partial_sorting_scan_right_to_left_32s_1k_block_sort(const sa_sint_t *
RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Unrolled by two; stops early enough that cache reads one prefetch distance ahead stay at or above omp_block_start. */
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais64_prefetchw(&cache[i - 2 * prefetch_distance]);

        /* Non-positive look-ahead symbols fall back to slot 0 for the prefetch address. */
        sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 > 0 ? s0 : 0]; libsais64_prefetchw(Is0);
        sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 > 0 ? s1 : 0]; libsais64_prefetchw(Is1);

        sa_sint_t v0 = cache[i - 0].symbol;
        if (v0 >= 0)
        {
            /* .symbol now holds the destination position for this entry. */
            cache[i - 0].symbol = --induction_bucket[v0];
            /* Destination at or above omp_block_start: induce a positive value again into that cache entry. */
            if (cache[i - 0].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; if (np > 0) { cache[i - 0].index = 0; cache[ni].index = (np - 1) | (sa_sint_t)((sa_uint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; } }
        }

        sa_sint_t v1 = cache[i - 1].symbol;
        if (v1 >= 0)
        {
            cache[i - 1].symbol = --induction_bucket[v1];
            if (cache[i - 1].symbol >= omp_block_start) { sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; if (np > 0) { cache[i - 1].index = 0; cache[ni].index = (np - 1) | (sa_sint_t)((sa_uint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; }}
        }
    }

    /* Remaining entries down to omp_block_start, without look-ahead. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t v = cache[i].symbol;
        if (v >= 0)
        {
            cache[i].symbol = --induction_bucket[v];
            if (cache[i].symbol >= omp_block_start) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; if (np > 0) { cache[i].index = 0; cache[ni].index = (np - 1) | (sa_sint_t)((sa_uint_t)(T[np - 2] > T[np - 1]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np - 1]; } }
        }
    }
}

/*
 * Block driver of the parallel 6k scan over
 * SA[block_start, block_start + block_size). Returns the updated d.
 *
 * In OpenMP builds a parallel region is opened only when threads > 1 and
 * block_size >= 16384. With a single thread the sequential 6k scan is
 * called; otherwise the threads gather their slices into `cache`, the
 * master thread runs the sort step over the whole block, and the threads
 * then write their slices back with libsais64_place_cached_suffixes.
 */
static sa_sint_t libsais64_partial_sorting_scan_right_to_left_32s_6k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, sa_sint_t d,
LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Per-thread slice: the stride is rounded down to a multiple of 16 and the last thread also takes the remainder. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            d = libsais64_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /*
             * `cache - block_start` rebases the cache so the helpers can index it with absolute SA positions.
             * NOTE(review): for block_start > 0 this forms a pointer before the start of `cache` - confirm this is acceptable on all targets.
             */
            {
                libsais64_partial_sorting_scan_right_to_left_32s_6k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            /* The sort step covers the whole block and runs on the master thread only. */
            #pragma omp barrier

            #pragma omp master
            {
                d = libsais64_partial_sorting_scan_right_to_left_32s_6k_block_sort(T, buckets, d, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            {
                libsais64_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }

    return d;
}

/*
 * Block driver of the parallel 4k scan over
 * SA[block_start, block_start + block_size). Returns the updated d.
 *
 * Same three-phase structure as the 6k block driver (gather per thread,
 * sort on the master thread, write back per thread), using the 4k helpers
 * and libsais64_compact_and_place_cached_suffixes for the write-back. The
 * parallel region is opened only when threads > 1 and block_size >= 16384.
 */
static sa_sint_t libsais64_partial_sorting_scan_right_to_left_32s_4k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t d, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t
omp_num_threads = 1;
#endif
        /* Per-thread slice: the stride is rounded down to a multiple of 16 and the last thread also takes the remainder. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            d = libsais64_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /*
             * `cache - block_start` rebases the cache so the helpers can index it with absolute SA positions.
             * NOTE(review): for block_start > 0 this forms a pointer before the start of `cache` - confirm this is acceptable on all targets.
             */
            {
                libsais64_partial_sorting_scan_right_to_left_32s_4k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            /* The sort step covers the whole block and runs on the master thread only. */
            #pragma omp barrier

            #pragma omp master
            {
                d = libsais64_partial_sorting_scan_right_to_left_32s_4k_block_sort(T, k, buckets, d, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            {
                libsais64_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }

    return d;
}

/*
 * Block driver of the parallel 1k scan over
 * SA[block_start, block_start + block_size). No d counter is involved.
 *
 * Same three-phase structure as the 6k and 4k block drivers (gather per
 * thread, sort on the master thread, write back per thread), using the 1k
 * helpers and libsais64_compact_and_place_cached_suffixes for the
 * write-back. The parallel region is opened only when threads > 1 and
 * block_size >= 16384.
 */
static void libsais64_partial_sorting_scan_right_to_left_32s_1k_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Per-thread slice: the stride is rounded down to a multiple of 16 and the last thread also takes the remainder. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            libsais64_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            /*
             * `cache - block_start` rebases the cache so the helpers can index it with absolute SA positions.
             * NOTE(review): for block_start > 0 this forms a pointer before the start of `cache` - confirm this is acceptable on all targets.
             */
            {
                libsais64_partial_sorting_scan_right_to_left_32s_1k_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            /* The sort step covers the whole block and runs on the master thread only. */
            #pragma omp barrier

            #pragma omp master
            {
                libsais64_partial_sorting_scan_right_to_left_32s_1k_block_sort(T, buckets, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            {
                libsais64_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }
}

#endif

/*
 * Top-level driver of the 6k right-to-left partial sorting scan. Returns
 * the updated d.
 *
 * The scanned range is SA[scan_start, scan_end) with
 * scan_start = left_suffixes_count + 1 and scan_end = n - first_lms_suffix.
 * When threads == 1 or the range is shorter than 65536 entries the
 * sequential scan is used. Otherwise (OpenMP builds) the range is processed
 * from the high end down in blocks of at most
 * threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries, all of them using
 * thread_state[0].state.cache as the block cache.
 *
 * NOTE(review): without LIBSAIS_OPENMP there is no else branch, so callers
 * are presumably expected to pass threads == 1 in such builds - confirm.
 */
static sa_sint_t libsais64_partial_sorting_scan_right_to_left_32s_6k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    fast_sint_t scan_start = (fast_sint_t)left_suffixes_count + 1;
    fast_sint_t scan_end = (fast_sint_t)n - (fast_sint_t)first_lms_suffix;

    if (threads == 1 || (scan_end - scan_start) < 65536)
    {
        d = libsais64_partial_sorting_scan_right_to_left_32s_6k(T, SA, buckets, d, scan_start, scan_end - scan_start);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        /* block_start is the highest index of the current block; block_end is one below its lowest index. */
        fast_sint_t block_start, block_end;
        for (block_start = scan_end - 1; block_start >= scan_start; block_start = block_end)
        {
            block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < scan_start) { block_end = scan_start - 1; }

            d = libsais64_partial_sorting_scan_right_to_left_32s_6k_block_omp(T, SA, buckets, d, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif

    return d;
}

/*
 * Top-level driver of the 4k right-to-left partial sorting scan over the
 * whole of SA[0, n). Returns the updated d.
 *
 * When threads == 1 or n < 65536 the sequential scan is used. Otherwise
 * (OpenMP builds) SA is processed from the high end down in blocks of at
 * most threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries, all of them using
 * thread_state[0].state.cache as the block cache.
 */
static sa_sint_t libsais64_partial_sorting_scan_right_to_left_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets,
sa_sint_t d, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    if (threads == 1 || n < 65536)
    {
        d = libsais64_partial_sorting_scan_right_to_left_32s_4k(T, SA, k, buckets, d, 0, n);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        /* block_start is the highest index of the current block; block_end is one below its lowest index. */
        fast_sint_t block_start, block_end;
        for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end)
        {
            block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; }

            d = libsais64_partial_sorting_scan_right_to_left_32s_4k_block_omp(T, SA, k, buckets, d, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif

    return d;
}

/*
 * Top-level driver of the 1k right-to-left partial sorting scan over the
 * whole of SA[0, n).
 *
 * When threads == 1 or n < 65536 the sequential scan is used. Otherwise
 * (OpenMP builds) SA is processed from the high end down in blocks of at
 * most threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries, all of them using
 * thread_state[0].state.cache as the block cache.
 *
 * NOTE(review): without LIBSAIS_OPENMP there is no else branch, so callers
 * are presumably expected to pass threads == 1 in such builds - confirm.
 */
static void libsais64_partial_sorting_scan_right_to_left_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    if (threads == 1 || n < 65536)
    {
        libsais64_partial_sorting_scan_right_to_left_32s_1k(T, SA, buckets, 0, n);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        /* block_start is the highest index of the current block; block_end is one below its lowest index. */
        fast_sint_t block_start, block_end;
        for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end)
        {
            block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; }

            libsais64_partial_sorting_scan_right_to_left_32s_1k_block_omp(T, SA, buckets, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif
}

/*
 * Compacts the negative entries of
 * SA[omp_block_start, omp_block_start + omp_block_size) to the front of
 * that range, preserving their order.
 *
 * Every entry s is written to SA[l] as
 * (s - SUFFIX_GROUP_MARKER) & ~SUFFIX_GROUP_MARKER (unsigned arithmetic),
 * but the write cursor l advances only when s is negative, so values
 * written for non-negative entries are overwritten by the next write.
 *
 * Returns l, the index one past the last kept entry.
 */
static fast_sint_t libsais64_partial_sorting_gather_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j, l;
    /* Unrolled by four. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4)
    {
        libsais64_prefetchr(&SA[i + prefetch_distance]);

        sa_uint_t s0 = (sa_uint_t)SA[i + 0]; SA[l] = (sa_sint_t)((s0 - (sa_uint_t)SUFFIX_GROUP_MARKER) &
(sa_uint_t)(~SUFFIX_GROUP_MARKER)); l += ((sa_sint_t)s0 < 0);
        sa_uint_t s1 = (sa_uint_t)SA[i + 1]; SA[l] = (sa_sint_t)((s1 - (sa_uint_t)SUFFIX_GROUP_MARKER) & (sa_uint_t)(~SUFFIX_GROUP_MARKER)); l += ((sa_sint_t)s1 < 0);
        sa_uint_t s2 = (sa_uint_t)SA[i + 2]; SA[l] = (sa_sint_t)((s2 - (sa_uint_t)SUFFIX_GROUP_MARKER) & (sa_uint_t)(~SUFFIX_GROUP_MARKER)); l += ((sa_sint_t)s2 < 0);
        sa_uint_t s3 = (sa_uint_t)SA[i + 3]; SA[l] = (sa_sint_t)((s3 - (sa_uint_t)SUFFIX_GROUP_MARKER) & (sa_uint_t)(~SUFFIX_GROUP_MARKER)); l += ((sa_sint_t)s3 < 0);
    }

    /* Up to three remaining entries. */
    for (j += 3; i < j; i += 1)
    {
        sa_uint_t s = (sa_uint_t)SA[i]; SA[l] = (sa_sint_t)((s - (sa_uint_t)SUFFIX_GROUP_MARKER) & (sa_uint_t)(~SUFFIX_GROUP_MARKER)); l += ((sa_sint_t)s < 0);
    }

    return l;
}

/*
 * Compacts the negative entries of
 * SA[omp_block_start, omp_block_start + omp_block_size) to the front of
 * that range, preserving their order and clearing their top bit.
 *
 * Every entry s is written to SA[l] as s & SAINT_MAX, but the write cursor
 * l advances only when s is negative, so values written for non-negative
 * entries are overwritten by the next write.
 *
 * Returns l, the index one past the last kept entry.
 */
static fast_sint_t libsais64_partial_sorting_gather_lms_suffixes_32s_1k(sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j, l;
    /* Unrolled by four. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - 3, l = omp_block_start; i < j; i += 4)
    {
        libsais64_prefetchr(&SA[i + prefetch_distance]);

        sa_sint_t s0 = SA[i + 0]; SA[l] = s0 & SAINT_MAX; l += (s0 < 0);
        sa_sint_t s1 = SA[i + 1]; SA[l] = s1 & SAINT_MAX; l += (s1 < 0);
        sa_sint_t s2 = SA[i + 2]; SA[l] = s2 & SAINT_MAX; l += (s2 < 0);
        sa_sint_t s3 = SA[i + 3]; SA[l] = s3 & SAINT_MAX; l += (s3 < 0);
    }

    /* Up to three remaining entries. */
    for (j += 3; i < j; i += 1)
    {
        sa_sint_t s = SA[i]; SA[l] = s & SAINT_MAX; l += (s < 0);
    }

    return l;
}

/*
 * Parallel wrapper around the 4k gather: compacts the negative entries of
 * SA[0, n) to the front of SA.
 *
 * In OpenMP builds a parallel region is opened only when threads > 1 and
 * n >= 65536. With a single thread the sequential gather runs over the
 * whole array. Otherwise each thread compacts its own slice and records the
 * slice start and kept count in thread_state; the master thread then moves
 * the per-thread results so they are contiguous from SA[0].
 */
static void libsais64_partial_sorting_gather_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; if (omp_num_threads == 1) { libsais64_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.position = omp_block_start; thread_state[omp_thread_num].state.count = libsais64_partial_sorting_gather_lms_suffixes_32s_4k(SA, omp_block_start, omp_block_size) - omp_block_start; } #pragma omp barrier #pragma omp master { fast_sint_t t, position = 0; for (t = 0; t < omp_num_threads; ++t) { if (t > 0 && thread_state[t].state.count > 0) { memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); } position += thread_state[t].state.count; } } } #endif } } static void libsais64_partial_sorting_gather_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : n - omp_block_start; if (omp_num_threads == 1) { libsais64_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.position = omp_block_start; thread_state[omp_thread_num].state.count = libsais64_partial_sorting_gather_lms_suffixes_32s_1k(SA, omp_block_start, omp_block_size) - omp_block_start; } #pragma omp barrier #pragma omp master { fast_sint_t t, position = 0; for (t = 0; t < omp_num_threads; ++t) { if (t > 0 && thread_state[t].state.count > 0) { memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); } position += thread_state[t].state.count; } } } #endif } } static void libsais64_induce_partial_order_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t flags, sa_sint_t * RESTRICT buckets, sa_sint_t first_lms_suffix, sa_sint_t left_suffixes_count, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { memset(&buckets[2 * ALPHABET_SIZE], 0, (size_t)2 * ALPHABET_SIZE * sizeof(sa_sint_t)); if (flags & LIBSAIS_FLAGS_GSA) { buckets[4 * ALPHABET_SIZE + BUCKETS_INDEX2(0, 1)] = buckets[4 * ALPHABET_SIZE + BUCKETS_INDEX2(1, 1)] - 1; libsais64_flip_suffix_markers_omp(SA, buckets[4 * ALPHABET_SIZE + BUCKETS_INDEX2(0, 1)], threads); } sa_sint_t d = libsais64_partial_sorting_scan_left_to_right_8u_omp(T, SA, n, k, buckets, left_suffixes_count, 0, threads, thread_state); libsais64_partial_sorting_shift_markers_8u_omp(SA, n, buckets, threads); if (flags & LIBSAIS_FLAGS_GSA) { libsais64_partial_gsa_scan_right_to_left_8u_omp(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count, d, threads, thread_state); if (T[first_lms_suffix] == 0) { memmove(&SA[1], &SA[0], (size_t)(buckets[BUCKETS_INDEX2(1, 1)] - 1) * sizeof(sa_sint_t)); SA[0] = first_lms_suffix | SAINT_MIN; } buckets[BUCKETS_INDEX2(0, 1)] = 0; } else { 
/*
 * Runs the partial-order induction steps of the "32s_4k" variant in sequence.
 * Each step operates on the shared SA / buckets state left by the previous
 * one, so the order of the calls must not be changed.
 *
 *   T            input text (read-only here).
 *   SA           work array, modified in place by every step.
 *   n, k         passed through to the scan helpers; k also sizes the
 *                bucket area that is cleared up front.
 *   buckets      scratch area; its first 2 * k entries are zeroed on entry.
 *   threads,
 *   thread_state forwarded unchanged to the *_omp helpers.
 */
static void libsais64_induce_partial_order_32s_4k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    /* Clear the first 2 * k bucket entries before scanning. */
    memset(buckets, 0, 2 * (size_t)k * sizeof(sa_sint_t));

    /* The value returned by the left-to-right scan is handed to the right-to-left scan below. */
    sa_sint_t d = libsais64_partial_sorting_scan_left_to_right_32s_4k_omp(T, SA, n, k, buckets, 0, threads, thread_state);
    libsais64_partial_sorting_shift_markers_32s_4k(SA, n);
    libsais64_partial_sorting_scan_right_to_left_32s_4k_omp(T, SA, n, k, buckets, d, threads, thread_state);
    libsais64_partial_sorting_gather_lms_suffixes_32s_4k_omp(SA, n, threads, thread_state);
}
/*
 * Runs the partial-order induction steps of the "32s_1k" variant in sequence.
 * The same buckets array is used for both scans: it is recounted and
 * re-initialised (first via the "start" initialiser, then via the "end"
 * initialiser) before each scan. The calls share SA / buckets state, so their
 * order must not be changed.
 *
 *   T            input text (read-only here).
 *   SA           work array, modified in place.
 *   n, k         forwarded to the counting / scan helpers.
 *   buckets      scratch area, rebuilt before each scan.
 *   threads,
 *   thread_state forwarded unchanged to the *_omp helpers.
 */
static void libsais64_induce_partial_order_32s_1k_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    /* First pass: count, initialise with the "start" variant, scan left to right. */
    libsais64_count_suffixes_32s(T, n, k, buckets);
    libsais64_initialize_buckets_start_32s_1k(k, buckets);
    libsais64_partial_sorting_scan_left_to_right_32s_1k_omp(T, SA, n, buckets, threads, thread_state);

    /* Second pass: recount, initialise with the "end" variant, scan right to left. */
    libsais64_count_suffixes_32s(T, n, k, buckets);
    libsais64_initialize_buckets_end_32s_1k(k, buckets);
    libsais64_partial_sorting_scan_right_to_left_32s_1k_omp(T, SA, n, buckets, threads, thread_state);

    /* Finally compact the marked entries of SA. */
    libsais64_partial_sorting_gather_lms_suffixes_32s_1k_omp(SA, n, threads, thread_state);
}
name | SAINT_MIN; name += p < 0; } return name; } static fast_sint_t libsais64_gather_marked_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; l -= 1; fast_sint_t i, j; for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j; i -= 4) { libsais64_prefetchr(&SA[i - prefetch_distance]); sa_sint_t s0 = SA[i - 0]; SA[l] = s0 & SAINT_MAX; l -= s0 < 0; sa_sint_t s1 = SA[i - 1]; SA[l] = s1 & SAINT_MAX; l -= s1 < 0; sa_sint_t s2 = SA[i - 2]; SA[l] = s2 & SAINT_MAX; l -= s2 < 0; sa_sint_t s3 = SA[i - 3]; SA[l] = s3 & SAINT_MAX; l -= s3 < 0; } for (j -= 3; i >= j; i -= 1) { sa_sint_t s = SA[i]; SA[l] = s & SAINT_MAX; l -= s < 0; } l += 1; return l; } static sa_sint_t libsais64_renumber_lms_suffixes_8u_omp(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { sa_sint_t name = 0; #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : m - omp_block_start; if (omp_num_threads == 1) { name = libsais64_renumber_lms_suffixes_8u(SA, m, 0, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.count = libsais64_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size); } #pragma omp barrier { fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } if (omp_thread_num == omp_num_threads - 1) { name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count); } libsais64_renumber_lms_suffixes_8u(SA, m, (sa_sint_t)count, omp_block_start, omp_block_size); } } #endif } return name; } static void libsais64_gather_marked_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start; if (omp_num_threads == 1) { libsais64_gather_marked_lms_suffixes(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { if (omp_thread_num < omp_num_threads - 1) { thread_state[omp_thread_num].state.position = libsais64_gather_marked_lms_suffixes(SA, m, (fast_sint_t)m + omp_block_start + omp_block_size, omp_block_start, omp_block_size); thread_state[omp_thread_num].state.count = (fast_sint_t)m + omp_block_start + omp_block_size - thread_state[omp_thread_num].state.position; } else { thread_state[omp_thread_num].state.position = libsais64_gather_marked_lms_suffixes(SA, m, (fast_sint_t)n + (fast_sint_t)fs, omp_block_start, omp_block_size); thread_state[omp_thread_num].state.count = (fast_sint_t)n + (fast_sint_t)fs - thread_state[omp_thread_num].state.position; } } #pragma omp barrier #pragma omp master { fast_sint_t t, position = (fast_sint_t)n + (fast_sint_t)fs; for (t = omp_num_threads - 1; t >= 0; --t) { position -= thread_state[t].state.count; if (t != omp_num_threads - 1 && thread_state[t].state.count > 0) { memmove(&SA[position], &SA[thread_state[t].state.position], (size_t)thread_state[t].state.count * sizeof(sa_sint_t)); } } } } #endif } } static sa_sint_t libsais64_renumber_and_gather_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t)); sa_sint_t name = libsais64_renumber_lms_suffixes_8u_omp(SA, m, threads, thread_state); if (name < m) { libsais64_gather_marked_lms_suffixes_omp(SA, n, m, fs, threads, thread_state); } else { fast_sint_t i; for (i = 0; i < m; i += 1) { SA[i] &= SAINT_MAX; } } return name; } static sa_sint_t libsais64_renumber_distinct_lms_suffixes_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t name, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const 
/*
 * Rewrites SA[m + omp_block_start .. m + omp_block_start + omp_block_size)
 * in place, left to right.
 *
 * A running value holds the most recent non-zero entry read in this call
 * (0 until one is seen). Each entry is AND-ed with (running | SAINT_MAX):
 * bits selected by SAINT_MAX are always preserved, the remaining bit survives
 * only if it is also set in the running value. Zero entries stay zero and do
 * not update the running value.
 */
static void libsais64_mark_distinct_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    const fast_sint_t block_end = (fast_sint_t)m + omp_block_start + omp_block_size;
    fast_sint_t pos = (fast_sint_t)m + omp_block_start;
    sa_sint_t last_nonzero = 0;

    /* Main part: four entries per iteration, one write prefetch ahead of the cursor. */
    while (pos < block_end - 3)
    {
        fast_sint_t u;

        libsais64_prefetchw(&SA[pos + prefetch_distance]);

        for (u = 0; u < 4; u += 1)
        {
            const sa_sint_t cur = SA[pos + u];

            SA[pos + u] = cur & (last_nonzero | SAINT_MAX);
            if (cur != 0) { last_nonzero = cur; }
        }

        pos += 4;
    }

    /* Remainder: fewer than four entries left. */
    for (; pos < block_end; pos += 1)
    {
        const sa_sint_t cur = SA[pos];

        SA[pos] = cur & (last_nonzero | SAINT_MAX);
        if (cur != 0) { last_nonzero = cur; }
    }
}
omp_block_stride : m - omp_block_start; if (omp_num_threads == 1) { name = libsais64_renumber_distinct_lms_suffixes_32s_4k(SA, m, 1, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.count = libsais64_count_negative_marked_suffixes(SA, omp_block_start, omp_block_size); } #pragma omp barrier { fast_sint_t t, count = 1; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } if (omp_thread_num == omp_num_threads - 1) { name = (sa_sint_t)(count + thread_state[omp_thread_num].state.count); } libsais64_renumber_distinct_lms_suffixes_32s_4k(SA, m, (sa_sint_t)count, omp_block_start, omp_block_size); } } #endif } return name - 1; } static void libsais64_mark_distinct_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start; #else UNUSED(threads); fast_sint_t omp_block_start = 0; fast_sint_t omp_block_size = (fast_sint_t)n >> 1; #endif libsais64_mark_distinct_lms_suffixes_32s(SA, m, omp_block_start, omp_block_size); } } static void libsais64_clamp_lms_suffixes_length_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start; #else UNUSED(threads); fast_sint_t omp_block_start = 0; fast_sint_t omp_block_size = (fast_sint_t)n >> 1; #endif libsais64_clamp_lms_suffixes_length_32s(SA, m, omp_block_start, omp_block_size); } } static sa_sint_t libsais64_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { memset(&SA[m], 0, ((size_t)n >> 1) * sizeof(sa_sint_t)); sa_sint_t name = libsais64_renumber_distinct_lms_suffixes_32s_4k_omp(SA, m, threads, thread_state); if (name < m) { libsais64_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads); } return name; } static sa_sint_t libsais64_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) { const fast_sint_t prefetch_distance = 64; sa_sint_t * RESTRICT SAm = &SA[m]; { libsais64_gather_lms_suffixes_32s(T, SA, n); memset(&SA[m], 0, ((size_t)n - (size_t)m - (size_t)m) * sizeof(sa_sint_t)); fast_sint_t i, j; for (i = (fast_sint_t)n - (fast_sint_t)m, j = 
(fast_sint_t)n - 1 - prefetch_distance - 3; i < j; i += 4) { libsais64_prefetchr(&SA[i + 2 * prefetch_distance]); libsais64_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); libsais64_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); libsais64_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]); libsais64_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]); SAm[((sa_uint_t)SA[i + 0]) >> 1] = SA[i + 1] - SA[i + 0] + 1 + SAINT_MIN; SAm[((sa_uint_t)SA[i + 1]) >> 1] = SA[i + 2] - SA[i + 1] + 1 + SAINT_MIN; SAm[((sa_uint_t)SA[i + 2]) >> 1] = SA[i + 3] - SA[i + 2] + 1 + SAINT_MIN; SAm[((sa_uint_t)SA[i + 3]) >> 1] = SA[i + 4] - SA[i + 3] + 1 + SAINT_MIN; } for (j += prefetch_distance + 3; i < j; i += 1) { SAm[((sa_uint_t)SA[i]) >> 1] = SA[i + 1] - SA[i] + 1 + SAINT_MIN; } SAm[((sa_uint_t)SA[n - 1]) >> 1] = 1 + SAINT_MIN; } { libsais64_clamp_lms_suffixes_length_32s_omp(SA, n, m, threads); } sa_sint_t name = 1; { fast_sint_t i, j, p = SA[0], plen = SAm[p >> 1]; sa_sint_t pdiff = SAINT_MIN; for (i = 1, j = m - prefetch_distance - 1; i < j; i += 2) { libsais64_prefetchr(&SA[i + 2 * prefetch_distance]); libsais64_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); libsais64_prefetchr(&T[((sa_uint_t)SA[i + prefetch_distance + 0])]); libsais64_prefetchw(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); libsais64_prefetchr(&T[((sa_uint_t)SA[i + prefetch_distance + 1])]); fast_sint_t q = SA[i + 0], qlen = SAm[q >> 1]; sa_sint_t qdiff = SAINT_MIN; if (plen == qlen) { fast_sint_t l = 0; do { if (T[p + l] != T[q + l]) { break; } } while (++l < qlen); qdiff = (sa_sint_t)(l - qlen) & SAINT_MIN; } SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0); p = SA[i + 1]; plen = SAm[p >> 1]; pdiff = SAINT_MIN; if (qlen == plen) { fast_sint_t l = 0; do { if (T[q + l] != T[p + l]) { break; } } while (++l < plen); pdiff = (sa_sint_t)(l - plen) & SAINT_MIN; } SAm[q >> 1] = name | (qdiff & pdiff); name += (pdiff < 
0); } for (j += prefetch_distance + 1; i < j; i += 1) { fast_sint_t q = SA[i], qlen = SAm[q >> 1]; sa_sint_t qdiff = SAINT_MIN; if (plen == qlen) { fast_sint_t l = 0; do { if (T[p + l] != T[q + l]) { break; } } while (++l < plen); qdiff = (sa_sint_t)(l - plen) & SAINT_MIN; } SAm[p >> 1] = name | (pdiff & qdiff); name += (qdiff < 0); p = q; plen = qlen; pdiff = qdiff; } SAm[p >> 1] = name | pdiff; name++; } if (name <= m) { libsais64_mark_distinct_lms_suffixes_32s_omp(SA, n, m, threads); } return name - 1; } static void libsais64_reconstruct_lms_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; const sa_sint_t * RESTRICT SAnm = &SA[n - m]; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) { libsais64_prefetchw(&SA[i + 2 * prefetch_distance]); libsais64_prefetchr(&SAnm[SA[i + prefetch_distance + 0]]); libsais64_prefetchr(&SAnm[SA[i + prefetch_distance + 1]]); libsais64_prefetchr(&SAnm[SA[i + prefetch_distance + 2]]); libsais64_prefetchr(&SAnm[SA[i + prefetch_distance + 3]]); SA[i + 0] = SAnm[SA[i + 0]]; SA[i + 1] = SAnm[SA[i + 1]]; SA[i + 2] = SAnm[SA[i + 2]]; SA[i + 3] = SAnm[SA[i + 3]]; } for (j += prefetch_distance + 3; i < j; i += 1) { SA[i] = SAnm[SA[i]]; } } static void libsais64_reconstruct_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? 
omp_block_stride : m - omp_block_start; #else UNUSED(threads); fast_sint_t omp_block_start = 0; fast_sint_t omp_block_size = m; #endif libsais64_reconstruct_lms_suffixes(SA, n, m, omp_block_start, omp_block_size); } } static void libsais64_place_lms_suffixes_interval_8u(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t flags, sa_sint_t * RESTRICT buckets) { if (flags & LIBSAIS_FLAGS_GSA) { buckets[7 * ALPHABET_SIZE]--; } { const sa_sint_t * RESTRICT bucket_end = &buckets[7 * ALPHABET_SIZE]; fast_sint_t c, j = n; for (c = ALPHABET_SIZE - 2; c >= 0; --c) { fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] - (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)]; if (l > 0) { fast_sint_t i = bucket_end[c]; if (j - i > 0) { memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); } memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); } } memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); } if (flags & LIBSAIS_FLAGS_GSA) { buckets[7 * ALPHABET_SIZE]++; } } static void libsais64_place_lms_suffixes_interval_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) { const sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k]; fast_sint_t c, j = n; for (c = (fast_sint_t)k - 2; c >= 0; --c) { fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1) + BUCKETS_INDEX2(1, 0)] - (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)]; if (l > 0) { fast_sint_t i = bucket_end[c]; if (j - i > 0) { memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); } memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); } } memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); } static void libsais64_place_lms_suffixes_interval_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) { fast_sint_t j = n; if (k > 1) { fast_sint_t c; for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= 
BUCKETS_INDEX2(1, 0)) { fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(1, 1)] - (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)]; if (l > 0) { fast_sint_t i = buckets[c]; if (j - i > 0) { memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); } memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); } } } memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); } static void libsais64_place_lms_suffixes_interval_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t m, sa_sint_t * RESTRICT buckets) { const fast_sint_t prefetch_distance = 64; sa_sint_t c = k - 1; fast_sint_t i, l = buckets[c]; for (i = (fast_sint_t)m - 1; i >= prefetch_distance + 3; i -= 4) { libsais64_prefetchr(&SA[i - 2 * prefetch_distance]); libsais64_prefetchr(&T[SA[i - prefetch_distance - 0]]); libsais64_prefetchr(&T[SA[i - prefetch_distance - 1]]); libsais64_prefetchr(&T[SA[i - prefetch_distance - 2]]); libsais64_prefetchr(&T[SA[i - prefetch_distance - 3]]); sa_sint_t p0 = SA[i - 0]; if (T[p0] != c) { c = T[p0]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p0; sa_sint_t p1 = SA[i - 1]; if (T[p1] != c) { c = T[p1]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p1; sa_sint_t p2 = SA[i - 2]; if (T[p2] != c) { c = T[p2]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p2; sa_sint_t p3 = SA[i - 3]; if (T[p3] != c) { c = T[p3]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p3; } for (; i >= 0; i -= 1) { sa_sint_t p = SA[i]; if (T[p] != c) { c = T[p]; memset(&SA[buckets[c]], 0, (size_t)(l - buckets[c]) * sizeof(sa_sint_t)); l = buckets[c]; } SA[--l] = p; } memset(&SA[0], 0, (size_t)l * sizeof(sa_sint_t)); } static void libsais64_place_lms_suffixes_histogram_32s_6k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t 
m, const sa_sint_t * RESTRICT buckets) { const sa_sint_t * RESTRICT bucket_end = &buckets[5 * (fast_sint_t)k]; fast_sint_t c, j = n; for (c = (fast_sint_t)k - 2; c >= 0; --c) { fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX4(c, 1)]; if (l > 0) { fast_sint_t i = bucket_end[c]; if (j - i > 0) { memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); } memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); } } memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); } static void libsais64_place_lms_suffixes_histogram_32s_4k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) { const sa_sint_t * RESTRICT bucket_end = &buckets[3 * (fast_sint_t)k]; fast_sint_t c, j = n; for (c = (fast_sint_t)k - 2; c >= 0; --c) { fast_sint_t l = (fast_sint_t)buckets[BUCKETS_INDEX2(c, 1)]; if (l > 0) { fast_sint_t i = bucket_end[c]; if (j - i > 0) { memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); } memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); } } memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); } static void libsais64_place_lms_suffixes_histogram_32s_2k(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, const sa_sint_t * RESTRICT buckets) { fast_sint_t j = n; if (k > 1) { fast_sint_t c; for (c = BUCKETS_INDEX2((fast_sint_t)k - 2, 0); c >= BUCKETS_INDEX2(0, 0); c -= BUCKETS_INDEX2(1, 0)) { fast_sint_t l = (fast_sint_t)buckets[c + BUCKETS_INDEX2(0, 1)]; if (l > 0) { fast_sint_t i = buckets[c]; if (j - i > 0) { memset(&SA[i], 0, (size_t)(j - i) * sizeof(sa_sint_t)); } memmove(&SA[j = (i - l)], &SA[m -= (sa_sint_t)l], (size_t)l * sizeof(sa_sint_t)); } } } memset(&SA[0], 0, (size_t)j * sizeof(sa_sint_t)); } static void libsais64_final_bwt_scan_left_to_right_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, 
/*
 * Left-to-right scan over SA[omp_block_start .. omp_block_start + omp_block_size).
 *
 * For every entry p:
 *   - the slot is first rewritten as p & SAINT_MAX;
 *   - if p > 0, p is decremented, the slot is overwritten with
 *     T[p] | SAINT_MIN, and p is appended at SA[induction_bucket[T[p]]++],
 *     with its top bit set when T[p - (p > 0)] < T[p];
 *   - additionally, when (p & rm) == 0, the already incremented bucket
 *     position is stored in I[p / (rm + 1)].
 *
 * NOTE(review): rm is used both as a bit mask and as (rm + 1) divisor, so it
 * is presumably of the form 2^x - 1 -- confirm against the caller.
 *
 * The appended entries may land inside the range still to be scanned, so the
 * statement order inside the loop must be preserved.
 */
static void libsais64_final_bwt_aux_scan_left_to_right_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;

    /* Main loop: two entries per iteration, with look-ahead prefetching. */
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais64_prefetchw(&SA[i + 2 * prefetch_distance]);

        /* Non-positive look-ahead values are replaced by index 2 so that Ts - 1 and Ts - 2 stay inside T. */
        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais64_prefetchr(Ts0 - 1); libsais64_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1 > 0 ? s1 : 2]; libsais64_prefetchr(Ts1 - 1); libsais64_prefetchr(Ts1 - 2);

        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; SA[induction_bucket[T[p0]]++] = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); if ((p0 & rm) == 0) { I[p0 / (rm + 1)] = induction_bucket[T[p0]]; }}
        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; SA[induction_bucket[T[p1]]++] = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); if ((p1 & rm) == 0) { I[p1 / (rm + 1)] = induction_bucket[T[p1]]; }}
    }

    /* Tail: the last entries of the block, one at a time and without prefetching. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]]; } }
    }
}
s1 : 2]; libsais64_prefetchr(Ts1 - 1); libsais64_prefetchr(Ts1 - 2); sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } } for (j += prefetch_distance + 1; i < j; i += 1) { sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } } } static void libsais64_final_sorting_scan_left_to_right_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - 2 * prefetch_distance - 1; i < j; i += 2) { libsais64_prefetchw(&SA[i + 3 * prefetch_distance]); sa_sint_t s0 = SA[i + 2 * prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 1]; libsais64_prefetchr(Ts0 - 1); sa_sint_t s1 = SA[i + 2 * prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ? 
s1 : 1]; libsais64_prefetchr(Ts1 - 1); sa_sint_t s2 = SA[i + 1 * prefetch_distance + 0]; if (s2 > 0) { libsais64_prefetchw(&induction_bucket[T[s2 - 1]]); libsais64_prefetchr(&T[s2] - 2); } sa_sint_t s3 = SA[i + 1 * prefetch_distance + 1]; if (s3 > 0) { libsais64_prefetchw(&induction_bucket[T[s3 - 1]]); libsais64_prefetchr(&T[s3] - 2); } sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; SA[induction_bucket[T[p0]]++] = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; SA[induction_bucket[T[p1]]++] = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } } for (j += 2 * prefetch_distance + 1; i < j; i += 1) { sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } } } #if defined(LIBSAIS_OPENMP) static fast_sint_t libsais64_final_bwt_scan_left_to_right_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; memset(buckets, 0, (size_t)k * sizeof(sa_sint_t)); fast_sint_t i, j, count = 0; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) { libsais64_prefetchw(&SA[i + 2 * prefetch_distance]); sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais64_prefetchr(Ts0 - 1); libsais64_prefetchr(Ts0 - 2); sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1 > 0 ? 
s1 : 2]; libsais64_prefetchr(Ts1 - 1); libsais64_prefetchr(Ts1 - 2); sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[i + 0] = T[p0] | SAINT_MIN; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[i + 1] = T[p1] | SAINT_MIN; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); } } for (j += prefetch_distance + 1; i < j; i += 1) { sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[i] = T[p] | SAINT_MIN; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } } return count; } static fast_sint_t libsais64_final_sorting_scan_left_to_right_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; memset(buckets, 0, (size_t)k * sizeof(sa_sint_t)); fast_sint_t i, j, count = 0; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) { libsais64_prefetchw(&SA[i + 2 * prefetch_distance]); sa_sint_t s0 = SA[i + prefetch_distance + 0]; const uint8_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais64_prefetchr(Ts0 - 1); libsais64_prefetchr(Ts0 - 2); sa_sint_t s1 = SA[i + prefetch_distance + 1]; const uint8_t * Ts1 = &T[s1 > 0 ? 
s1 : 2]; libsais64_prefetchr(Ts1 - 1); libsais64_prefetchr(Ts1 - 2);

        /* Per slot: XOR the stored value with SAINT_MIN in place. When the value read was > 0, queue (p - 1) in the cache
           under symbol T[p - 1] and count it in this call's histogram; the queued index gets its top bit set when
           (p - 1) > 0 and T[p - 2] < T[p - 1]. */
        sa_sint_t p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); }
        sa_sint_t p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); }
    }

    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
    }

    /* Number of cache entries written. */
    return count;
}

/*
 * Replays `count` cached (symbol, index) pairs into SA, in cache order: each index is stored at
 * SA[buckets[symbol]] and that cursor is post-incremented.
 *
 * SA       destination array
 * buckets  per-symbol write cursors, advanced by this function
 * cache    entries to place; only .symbol and .index are read
 * count    number of valid entries in cache
 */
static void libsais64_final_order_scan_left_to_right_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Four entries per iteration while at least four remain. */
    for (i = 0, j = count - 3; i < j; i += 4)
    {
        libsais64_prefetchr(&cache[i + prefetch_distance]);

        SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index;
        SA[buckets[cache[i + 1].symbol]++] = cache[i + 1].index;
        SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index;
        SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index;
    }

    /* Up to three leftover entries. */
    for (j += 3; i < j; i += 1)
    {
        SA[buckets[cache[i].symbol]++] = cache[i].index;
    }
}

/*
 * Same replay as the plain block_place above, with sampling: after an entry is placed, if (index & rm) == 0 the
 * already-advanced cursor buckets[symbol] is stored in I[(index & SAINT_MAX) / (rm + 1)].
 */
static void libsais64_final_bwt_aux_scan_left_to_right_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = 0, j = count - 3; i < j; i += 4)
    {
        libsais64_prefetchr(&cache[i + prefetch_distance]);

        SA[buckets[cache[i + 0].symbol]++] = cache[i + 0].index; if ((cache[i + 0].index & rm) == 0) { I[(cache[i + 0].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 0].symbol]; }
        SA[buckets[cache[i + 
1].symbol]++] = cache[i + 1].index; if ((cache[i + 1].index & rm) == 0) { I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 1].symbol]; } SA[buckets[cache[i + 2].symbol]++] = cache[i + 2].index; if ((cache[i + 2].index & rm) == 0) { I[(cache[i + 2].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 2].symbol]; } SA[buckets[cache[i + 3].symbol]++] = cache[i + 3].index; if ((cache[i + 3].index & rm) == 0) { I[(cache[i + 3].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i + 3].symbol]; } } for (j += 3; i < j; i += 1) { SA[buckets[cache[i].symbol]++] = cache[i].index; if ((cache[i].index & rm) == 0) { I[(cache[i].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol]; } } } static void libsais64_final_sorting_scan_left_to_right_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2) { libsais64_prefetchw(&SA[i + 2 * prefetch_distance]); sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais64_prefetchr(Ts0 - 1); libsais64_prefetchr(Ts0 - 2); sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ? 
s1 : 2]; libsais64_prefetchr(Ts1 - 1); libsais64_prefetchr(Ts1 - 2); libsais64_prefetchw(&cache[i + prefetch_distance]); sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; SA[i + 0] = p0 ^ SAINT_MIN; if (p0 > 0) { p0--; cache[i + 0].index = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] < T[p0]) << (SAINT_BIT - 1)); symbol0 = T[p0]; } cache[i + 0].symbol = symbol0; sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; SA[i + 1] = p1 ^ SAINT_MIN; if (p1 > 0) { p1--; cache[i + 1].index = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] < T[p1]) << (SAINT_BIT - 1)); symbol1 = T[p1]; } cache[i + 1].symbol = symbol1; } for (j += prefetch_distance + 1; i < j; i += 1) { sa_sint_t symbol = SAINT_MIN, p = SA[i]; SA[i] = p ^ SAINT_MIN; if (p > 0) { p--; cache[i].index = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); symbol = T[p]; } cache[i].symbol = symbol; } } static void libsais64_final_sorting_scan_left_to_right_32s_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j, omp_block_end = omp_block_start + omp_block_size; for (i = omp_block_start, j = omp_block_end - prefetch_distance - 1; i < j; i += 2) { libsais64_prefetchw(&cache[i + 2 * prefetch_distance]); sa_sint_t s0 = cache[i + prefetch_distance + 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 > 0 ? s0 : 0]; libsais64_prefetchw(Is0); sa_sint_t s1 = cache[i + prefetch_distance + 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 > 0 ? 
s1 : 0]; libsais64_prefetchw(Is1);

        /* For a cached entry with a non-negative symbol: replace the symbol by its target position (the bucket cursor,
           post-incremented). If that target lies below omp_block_end, the entry itself is processed right here:
           its index is XOR-ed with SAINT_MIN and, when positive, its predecessor is written into cache[target]. */
        sa_sint_t v0 = cache[i + 0].symbol;
        if (v0 >= 0)
        {
            cache[i + 0].symbol = induction_bucket[v0]++;
            if (cache[i + 0].symbol < omp_block_end) { sa_sint_t ni = cache[i + 0].symbol, np = cache[i + 0].index; cache[i + 0].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | (sa_sint_t)((sa_uint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
        }

        sa_sint_t v1 = cache[i + 1].symbol;
        if (v1 >= 0)
        {
            cache[i + 1].symbol = induction_bucket[v1]++;
            if (cache[i + 1].symbol < omp_block_end) { sa_sint_t ni = cache[i + 1].symbol, np = cache[i + 1].index; cache[i + 1].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | (sa_sint_t)((sa_uint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
        }
    }

    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t v = cache[i].symbol;
        if (v >= 0)
        {
            cache[i].symbol = induction_bucket[v]++;
            if (cache[i].symbol < omp_block_end) { sa_sint_t ni = cache[i].symbol, np = cache[i].index; cache[i].index = np ^ SAINT_MIN; if (np > 0) { np--; cache[ni].index = np | (sa_sint_t)((sa_uint_t)(T[np - (np > 0)] < T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; } }
        }
    }
}

/*
 * Runs the left-to-right final BWT scan over SA[block_start, block_start + block_size), optionally with an
 * OpenMP team.
 *
 * The parallel region is only activated when threads > 1, block_size >= 64 * max(k, 256) and
 * omp_get_dynamic() == 0. The block is cut into per-thread sub-ranges whose length is block_size /
 * omp_num_threads rounded down to a multiple of 16; the last thread also takes the remainder.
 *
 *   - one thread: calls the sequential scan on the whole sub-range.
 *   - several threads: (1) each thread runs block_prepare into its own thread_state buckets/cache,
 *     (2) the master thread turns the per-thread counts into per-thread start cursors,
 *     (3) each thread places its cached entries into SA.
 *
 * k             number of symbols (length of each bucket array)
 * thread_state  per-thread scratch; .state.buckets, .state.cache and .state.count are used
 */
static void libsais64_final_bwt_scan_left_to_right_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ? k : 256) && omp_get_dynamic() == 0)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num    = omp_get_thread_num();
        fast_sint_t omp_num_threads   = omp_get_num_threads();
#else
        UNUSED(k); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num    = 0;
        fast_sint_t omp_num_threads   = 1;
#endif
        fast_sint_t omp_block_stride  = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start   = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size    = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        /* Convert the block-relative start into an absolute SA position. */
        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            libsais64_final_bwt_scan_left_to_right_8u(T, SA, induction_bucket, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                thread_state[omp_thread_num].state.count = libsais64_final_bwt_scan_left_to_right_8u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            #pragma omp master
            {
                /* In thread order: each thread's count for symbol c is swapped for the current global cursor, and the
                   global cursor moves past that thread's entries. */
                fast_sint_t t;
                for (t = 0; t < omp_num_threads; ++t)
                {
                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
                    fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; }
                }
            }

            #pragma omp barrier

            {
                libsais64_final_order_scan_left_to_right_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
            }
        }
#endif
    }
}

/*
 * Sampling variant of the block driver above: same partitioning and three-phase structure, but the
 * single-thread path calls the bwt_aux sequential scan and the placement phase calls the bwt_aux block_place,
 * both of which also receive rm and I.
 */
static void libsais64_final_bwt_aux_scan_left_to_right_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ? 
k : 256) && omp_get_dynamic() == 0) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(k); UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { libsais64_final_bwt_aux_scan_left_to_right_8u(T, SA, rm, I, induction_bucket, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.count = libsais64_final_bwt_scan_left_to_right_8u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); } #pragma omp barrier #pragma omp master { fast_sint_t t; for (t = 0; t < omp_num_threads; ++t) { sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; } } } #pragma omp barrier { libsais64_final_bwt_aux_scan_left_to_right_8u_block_place(SA, rm, I, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); } } #endif } } static void libsais64_final_sorting_scan_left_to_right_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ? 
k : 256) && omp_get_dynamic() == 0) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(k); UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { libsais64_final_sorting_scan_left_to_right_8u(T, SA, induction_bucket, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { thread_state[omp_thread_num].state.count = libsais64_final_sorting_scan_left_to_right_8u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size); } #pragma omp barrier #pragma omp master { fast_sint_t t; for (t = 0; t < omp_num_threads; ++t) { sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets; fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A + B; temp_bucket[c] = A; } } } #pragma omp barrier { libsais64_final_order_scan_left_to_right_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count); } } #endif } } static void libsais64_final_sorting_scan_left_to_right_32s_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); 
fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); UNUSED(cache); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; omp_block_start += block_start; if (omp_num_threads == 1) { libsais64_final_sorting_scan_left_to_right_32s(T, SA, buckets, omp_block_start, omp_block_size); } #if defined(LIBSAIS_OPENMP) else { { libsais64_final_sorting_scan_left_to_right_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size); } #pragma omp barrier #pragma omp master { libsais64_final_sorting_scan_left_to_right_32s_block_sort(T, buckets, cache - block_start, block_start, block_size); } #pragma omp barrier { libsais64_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size); } } #endif } } #endif static void libsais64_final_bwt_scan_left_to_right_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | (sa_sint_t)((sa_uint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1)); if (threads == 1 || n < 65536) { libsais64_final_bwt_scan_left_to_right_8u(T, SA, induction_bucket, 0, n); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t block_start; for (block_start = 0; block_start < n; ) { if (SA[block_start] == 0) { block_start++; } else { fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;} fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; } 
fast_sint_t block_size = block_end - block_start; if (block_size < 32) { for (; block_start < block_end; block_start += 1) { sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[block_start] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); } } } else { libsais64_final_bwt_scan_left_to_right_8u_block_omp(T, SA, k, induction_bucket, block_start, block_size, threads, thread_state); block_start = block_end; } } } } #else UNUSED(k); UNUSED(thread_state); #endif } static void libsais64_final_bwt_aux_scan_left_to_right_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t k, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | (sa_sint_t)((sa_uint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1)); if ((((sa_sint_t)n - 1) & rm) == 0) { I[((sa_sint_t)n - 1) / (rm + 1)] = induction_bucket[T[(sa_sint_t)n - 1]]; } if (threads == 1 || n < 65536) { libsais64_final_bwt_aux_scan_left_to_right_8u(T, SA, rm, I, induction_bucket, 0, n); } #if defined(LIBSAIS_OPENMP) else { fast_sint_t block_start; for (block_start = 0; block_start < n; ) { if (SA[block_start] == 0) { block_start++; } else { fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;} fast_sint_t block_end = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; } fast_sint_t block_size = block_end - block_start; if (block_size < 32) { for (; block_start < block_end; block_start += 1) { sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[block_start] = T[p] | SAINT_MIN; SA[induction_bucket[T[p]]++] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 
0)] < T[p]) << (SAINT_BIT - 1)); if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]]; } }
                    }
                }
                else
                {
                    libsais64_final_bwt_aux_scan_left_to_right_8u_block_omp(T, SA, k, rm, I, induction_bucket, block_start, block_size, threads, thread_state);
                    block_start = block_end;
                }
            }
        }
    }
#else
    UNUSED(k); UNUSED(thread_state);
#endif
}

/*
 * Top-level left-to-right final sorting scan for 8-bit text.
 *
 * First seeds SA with position n - 1 at the bucket cursor of T[n - 1] (top bit set when T[n - 2] < T[n - 1]).
 * T[n - 2] is read unconditionally, so n must be at least 2.
 *
 * With threads == 1 or n < 65536 the whole array is handled by one sequential scan. Otherwise (OpenMP builds
 * only) SA is walked front to back: zero entries are skipped, and each run of consecutive non-zero entries,
 * capped at threads * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads) slots, is either processed inline when
 * shorter than 32 slots or handed to the block driver.
 */
static void libsais64_final_sorting_scan_left_to_right_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    SA[induction_bucket[T[(sa_sint_t)n - 1]]++] = ((sa_sint_t)n - 1) | (sa_sint_t)((sa_uint_t)(T[(sa_sint_t)n - 2] < T[(sa_sint_t)n - 1]) << (SAINT_BIT - 1));

    if (threads == 1 || n < 65536)
    {
        libsais64_final_sorting_scan_left_to_right_8u(T, SA, induction_bucket, 0, n);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start;
        for (block_start = 0; block_start < n; )
        {
            if (SA[block_start] == 0)
            {
                block_start++;
            }
            else
            {
                /* Extend the run until a zero entry, the cache-derived cap, or n. */
                fast_sint_t block_max_end = block_start + ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end > n) { block_max_end = n;}
                fast_sint_t block_end     = block_start + 1; while (block_end < block_max_end && SA[block_end] != 0) { block_end++; }
                fast_sint_t block_size    = block_end - block_start;

                if (block_size < 32)
                {
                    /* Short run: same per-slot step as the sequential scan, done inline. */
                    for (; block_start < block_end; block_start += 1)
                    {
                        sa_sint_t p = SA[block_start]; SA[block_start] = p ^ SAINT_MIN; if (p > 0) { p--; SA[induction_bucket[T[p]]++] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] < T[p]) << (SAINT_BIT - 1)); }
                    }
                }
                else
                {
                    libsais64_final_sorting_scan_left_to_right_8u_block_omp(T, SA, k, induction_bucket, block_start, block_size, threads, thread_state);
                    block_start = block_end;
                }
            }
        }
    }
#else
    UNUSED(k); UNUSED(thread_state);
#endif
}

/*
 * Top-level left-to-right final sorting scan for text stored as sa_sint_t symbols.
 *
 * Seeds SA with position n - 1 exactly like the 8-bit variant (T[n - 2] is read, so n must be at least 2).
 * With threads == 1 or n < 65536 one sequential scan covers [0, n). Otherwise (OpenMP builds only) SA is
 * processed in consecutive windows of threads * LIBSAIS_PER_THREAD_CACHE_SIZE slots, each passed to the 32s
 * block driver together with thread_state[0].state.cache.
 */
static void libsais64_final_sorting_scan_left_to_right_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    SA[induction_bucket[T[n - 1]]++] = (n - 1) | (sa_sint_t)((sa_uint_t)(T[n - 2] < T[n - 1]) << (SAINT_BIT - 1));

    if (threads == 1 || n < 65536)
    {
        libsais64_final_sorting_scan_left_to_right_32s(T, SA, induction_bucket, 0, n);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start, block_end;
        for (block_start = 0; block_start < n; block_start = block_end)
        {
            block_end = block_start + (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end > n) { block_end = n; }

            libsais64_final_sorting_scan_left_to_right_32s_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, block_start, block_end - block_start, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif
}

/*
 * Right-to-left final BWT scan over SA[omp_block_start, omp_block_start + omp_block_size), visiting slots
 * from the highest index down.
 *
 * Per slot: if the value read is 0 the slot position is remembered in `index`; the slot is rewritten with the
 * SAINT_MAX bits of the value. When the value p is > 0, with c1 = T[p - 1] and c0 = T[p - 2] (c0 = c1 when
 * p - 1 == 0), the slot is overwritten with c1 and the pre-decremented bucket cursor of c1 receives p - 1 if
 * c0 <= c1, otherwise c0 | SAINT_MIN.
 *
 * Returns the position of the last zero-valued slot encountered (the lowest one in the block), or -1 if none.
 */
static sa_sint_t libsais64_final_bwt_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j; sa_sint_t index = -1;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais64_prefetchw(&SA[i - 2 * prefetch_distance]);

        sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais64_prefetchr(Ts0 - 1); libsais64_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1 > 0 ? s1 : 2]; libsais64_prefetchr(Ts1 - 1); libsais64_prefetchr(Ts1 - 2);

        sa_sint_t p0 = SA[i - 0]; index = (p0 == 0) ? (sa_sint_t)(i - 0) : index;
        SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; }

        sa_sint_t p1 = SA[i - 1]; index = (p1 == 0) ? 
(sa_sint_t)(i - 1) : index; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p1 : t; } } for (j -= prefetch_distance + 1; i >= j; i -= 1) { sa_sint_t p = SA[i]; index = (p == 0) ? (sa_sint_t)i : index; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; } } return index; } static void libsais64_final_bwt_aux_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) { libsais64_prefetchw(&SA[i - 2 * prefetch_distance]); sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais64_prefetchr(Ts0 - 1); libsais64_prefetchr(Ts0 - 2); sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1 > 0 ? s1 : 2]; libsais64_prefetchr(Ts1 - 1); libsais64_prefetchr(Ts1 - 2); sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p0 : t; if ((p0 & rm) == 0) { I[p0 / (rm + 1)] = induction_bucket[T[p0]] + 1; } } sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? 
p1 : t; if ((p1 & rm) == 0) { I[p1 / (rm + 1)] = induction_bucket[T[p1]] + 1; } } } for (j -= prefetch_distance + 1; i >= j; i -= 1) { sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]] + 1; } } } } static void libsais64_final_sorting_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) { libsais64_prefetchw(&SA[i - 2 * prefetch_distance]); sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais64_prefetchr(Ts0 - 1); libsais64_prefetchr(Ts0 - 2); sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1 > 0 ? 
s1 : 2]; libsais64_prefetchr(Ts1 - 1); libsais64_prefetchr(Ts1 - 2);

        /* Per slot: keep only the SAINT_MAX bits of the stored value; when the value read was > 0, store (p - 1) at the
           pre-decremented bucket cursor of T[p - 1], with the top bit set when (p - 1) > 0 and T[p - 2] > T[p - 1]. */
        sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); }
        sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); }
    }

    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); }
    }
}

/*
 * Right-to-left final scan over SA[omp_block_start, omp_block_start + omp_block_size) for 8-bit text, GSA
 * variant.
 *
 * Slots are visited from the highest index down. Each slot is rewritten with the SAINT_MAX bits of its
 * value p. A predecessor is emitted only when p > 0 AND T[p - 1] > 0: then p - 1 is stored at the
 * pre-decremented bucket cursor of T[p - 1], with the top bit set when (p - 1) > 0 and T[p - 2] > T[p - 1].
 * Positions whose preceding symbol is 0 are therefore never inserted.
 * NOTE(review): symbol 0 is presumably the string separator of the generalized suffix array -- confirm.
 */
static void libsais64_final_gsa_scan_right_to_left_8u(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    /* Two slots per iteration; stops prefetch_distance + 1 slots above the block start so the look-ahead reads
       of SA[i - prefetch_distance - 0/1] stay inside the block. */
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais64_prefetchw(&SA[i - 2 * prefetch_distance]);

        /* Non-positive look-ahead values are replaced by index 2 so that the prefetch addresses are &T[1] and &T[0]. */
        sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais64_prefetchr(Ts0 - 1); libsais64_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1 > 0 ? s1 : 2]; libsais64_prefetchr(Ts1 - 1); libsais64_prefetchr(Ts1 - 2);

        sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0 && T[p0 - 1] > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); }
        sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0 && T[p1 - 1] > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); }
    }

    /* Lowest slots of the block, one at a time, no prefetching. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0 && T[p - 1] > 0) { p--; SA[--induction_bucket[T[p]]] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); }
    }
}

/*
 * Right-to-left final sorting scan for text stored as sa_sint_t symbols. The per-slot step matches the 8-bit
 * sorting scan (no separator test); the look-ahead is split into two stages, 2 * prefetch_distance below the
 * current slot for the text and 1 * prefetch_distance below for the bucket cursor of the preceding symbol.
 */
static void libsais64_final_sorting_scan_right_to_left_32s(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT induction_bucket, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + 2 * prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais64_prefetchw(&SA[i - 3 * prefetch_distance]);

        sa_sint_t s0 = SA[i - 2 * prefetch_distance - 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 1]; libsais64_prefetchr(Ts0 - 1);
        sa_sint_t s1 = SA[i - 2 * prefetch_distance - 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ? 
s1 : 1]; libsais64_prefetchr(Ts1 - 1); sa_sint_t s2 = SA[i - 1 * prefetch_distance - 0]; if (s2 > 0) { libsais64_prefetchw(&induction_bucket[T[s2 - 1]]); libsais64_prefetchr(&T[s2] - 2); } sa_sint_t s3 = SA[i - 1 * prefetch_distance - 1]; if (s3 > 0) { libsais64_prefetchw(&induction_bucket[T[s3 - 1]]); libsais64_prefetchr(&T[s3] - 2); } sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; SA[--induction_bucket[T[p0]]] = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); } sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; SA[--induction_bucket[T[p1]]] = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); } } for (j -= 2 * prefetch_distance + 1; i >= j; i -= 1) { sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); } } } #if defined(LIBSAIS_OPENMP) static fast_sint_t libsais64_final_bwt_scan_right_to_left_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; memset(buckets, 0, (size_t)k * sizeof(sa_sint_t)); fast_sint_t i, j, count = 0; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2) { libsais64_prefetchw(&SA[i - 2 * prefetch_distance]); sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais64_prefetchr(Ts0 - 1); libsais64_prefetchr(Ts0 - 2); sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1 > 0 ? 
s1 : 2]; libsais64_prefetchr(Ts1 - 1); libsais64_prefetchr(Ts1 - 2);

        /* c1 replaces the SA entry; the cache record keeps either p - 1 or c0 tagged with SAINT_MIN. */
        sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p0 : t; }
        sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p1 : t; }
    }

    /* Restore j to omp_block_start and drain the tail without prefetching. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count++].index = (c0 <= c1) ? p : t; }
    }

    return count;
}

/*
 * "Prepare" step of the blocked right-to-left BWT scan with auxiliary indexes.
 *
 * Same walk as libsais64_final_bwt_scan_right_to_left_8u_block_prepare, but
 * every record occupies two consecutive cache slots:
 *   cache[count].symbol     = c1
 *   cache[count].index      = p - 1 when c0 <= c1, otherwise c0 | SAINT_MIN
 *   cache[count + 1].index  = p - 1 (always the plain position)
 * The symbol field of the second slot is not written here.
 *
 * Returns the number of cache slots used, i.e. twice the number of records.
 */
static fast_sint_t libsais64_final_bwt_aux_scan_right_to_left_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    memset(buckets, 0, (size_t)k * sizeof(sa_sint_t));

    fast_sint_t i, j, count = 0;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais64_prefetchw(&SA[i - 2 * prefetch_distance]);

        sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais64_prefetchr(Ts0 - 1); libsais64_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1 > 0 ? s1 : 2]; libsais64_prefetchr(Ts1 - 1); libsais64_prefetchr(Ts1 - 2);

        sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; uint8_t c0 = T[p0 - (p0 > 0)], c1 = T[p0]; SA[i - 0] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p0 : t; cache[count + 1].index = p0; count += 2; }
        sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; uint8_t c0 = T[p1 - (p1 > 0)], c1 = T[p1]; SA[i - 1] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p1 : t; cache[count + 1].index = p1; count += 2; }
    }

    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[i] = c1; sa_sint_t t = c0 | SAINT_MIN; buckets[cache[count].symbol = c1]++; cache[count].index = (c0 <= c1) ? p : t; cache[count + 1].index = p; count += 2; }
    }

    return count;
}

/*
 * "Prepare" step of the blocked right-to-left final sorting scan (8-bit text).
 *
 * Zeroes the first k counters of buckets and walks the SA slice from its last
 * entry to its first. Each entry is written back masked with SAINT_MAX; for an
 * entry p > 0 a cache record is appended with symbol = T[p - 1] and
 * index = (p - 1) with bit (SAINT_BIT - 1) set when T[p - 2] > T[p - 1]
 * (the bit stays clear when p - 1 == 0). buckets[symbol] is incremented for
 * every record.
 *
 * Returns the number of cache records written.
 */
static fast_sint_t libsais64_final_sorting_scan_right_to_left_8u_block_prepare(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    memset(buckets, 0, (size_t)k * sizeof(sa_sint_t));

    fast_sint_t i, j, count = 0;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais64_prefetchw(&SA[i - 2 * prefetch_distance]);

        sa_sint_t s0 = SA[i - prefetch_distance - 0]; const uint8_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais64_prefetchr(Ts0 - 1); libsais64_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i - prefetch_distance - 1]; const uint8_t * Ts1 = &T[s1 > 0 ? s1 : 2]; libsais64_prefetchr(Ts1 - 1); libsais64_prefetchr(Ts1 - 2);

        sa_sint_t p0 = SA[i - 0]; SA[i - 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; buckets[cache[count].symbol = T[p0]]++; cache[count++].index = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); }
        sa_sint_t p1 = SA[i - 1]; SA[i - 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; buckets[cache[count].symbol = T[p1]]++; cache[count++].index = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); }
    }

    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; buckets[cache[count].symbol = T[p]]++; cache[count++].index = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); }
    }

    return count;
}

/*
 * "Place" step shared by the blocked BWT and final sorting scans.
 *
 * Replays the first count cache records in the order they were recorded: each
 * record's index is stored at SA[--buckets[symbol]]. buckets is expected to
 * hold this thread's bucket positions on entry and is decremented as records
 * are placed. The first loop handles four records per iteration and prefetches
 * the cache ahead; the second loop handles the remainder.
 */
static void libsais64_final_order_scan_right_to_left_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = 0, j = count - 3; i < j; i += 4)
    {
        libsais64_prefetchr(&cache[i + prefetch_distance]);

        SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index;
        SA[--buckets[cache[i + 1].symbol]] = cache[i + 1].index;
        SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index;
        SA[--buckets[cache[i + 3].symbol]] = cache[i + 3].index;
    }

    for (j += 3; i < j; i += 1)
    {
        SA[--buckets[cache[i].symbol]] = cache[i].index;
    }
}

/*
 * "Place" step of the blocked GSA right-to-left scan.
 *
 * Identical to libsais64_final_order_scan_right_to_left_8u_block_place except
 * that records whose symbol is not greater than 0 are skipped: nothing is
 * written to SA and the bucket for that symbol is left untouched.
 */
static void libsais64_final_gsa_scan_right_to_left_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = 0, j = count - 3; i < j; i += 4)
    {
        libsais64_prefetchr(&cache[i + prefetch_distance]);

        if (cache[i + 0].symbol > 0) { SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index; }
        if (cache[i + 1].symbol > 0) { SA[--buckets[cache[i + 1].symbol]] = cache[i + 1].index; }
        if (cache[i + 2].symbol > 0) { SA[--buckets[cache[i +
2].symbol]] = cache[i + 2].index; }
        if (cache[i + 3].symbol > 0) { SA[--buckets[cache[i + 3].symbol]] = cache[i + 3].index; }
    }

    /* Remaining records, one at a time, with the same symbol > 0 filter. */
    for (j += 3; i < j; i += 1)
    {
        if (cache[i].symbol > 0) { SA[--buckets[cache[i].symbol]] = cache[i].index; }
    }
}

/*
 * "Place" step of the blocked right-to-left BWT scan with auxiliary indexes.
 *
 * cache holds count slots arranged as pairs: slot i carries the symbol and the
 * value to store, slot i + 1 carries the plain text position in its index
 * field. For every pair the value is stored at SA[--buckets[symbol]]; when the
 * position has no bits in common with rm, I[position / (rm + 1)] receives the
 * just-decremented bucket value plus one.
 *
 * The first loop handles four pairs per iteration, the second one pair.
 * NOTE(review): only the second loop masks the position with SAINT_MAX before
 * dividing; the prepare step in this file stores non-negative positions in
 * that field, so the mask looks like a no-op there -- confirm before unifying.
 */
static void libsais64_final_bwt_aux_scan_right_to_left_8u_block_place(sa_sint_t * RESTRICT SA, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t count)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = 0, j = count - 6; i < j; i += 8)
    {
        libsais64_prefetchr(&cache[i + prefetch_distance]);

        SA[--buckets[cache[i + 0].symbol]] = cache[i + 0].index; if ((cache[i + 1].index & rm) == 0) { I[cache[i + 1].index / (rm + 1)] = buckets[cache[i + 0].symbol] + 1; }
        SA[--buckets[cache[i + 2].symbol]] = cache[i + 2].index; if ((cache[i + 3].index & rm) == 0) { I[cache[i + 3].index / (rm + 1)] = buckets[cache[i + 2].symbol] + 1; }
        SA[--buckets[cache[i + 4].symbol]] = cache[i + 4].index; if ((cache[i + 5].index & rm) == 0) { I[cache[i + 5].index / (rm + 1)] = buckets[cache[i + 4].symbol] + 1; }
        SA[--buckets[cache[i + 6].symbol]] = cache[i + 6].index; if ((cache[i + 7].index & rm) == 0) { I[cache[i + 7].index / (rm + 1)] = buckets[cache[i + 6].symbol] + 1; }
    }

    for (j += 6; i < j; i += 2)
    {
        SA[--buckets[cache[i].symbol]] = cache[i].index; if ((cache[i + 1].index & rm) == 0) { I[(cache[i + 1].index & SAINT_MAX) / (rm + 1)] = buckets[cache[i].symbol] + 1; }
    }
}

/*
 * "Gather" step of the blocked right-to-left final sorting scan (integer text).
 *
 * Walks SA[omp_block_start .. omp_block_start + omp_block_size - 1] in
 * increasing order and fills the cache slot with the same index as the SA
 * entry (the callers in this file pass a cache pointer rebased by the block
 * start). Each SA entry p is written back masked with SAINT_MAX. For p > 0 the
 * slot gets index = (p - 1) with bit (SAINT_BIT - 1) set when
 * T[p - 2] > T[p - 1], and symbol = T[p - 1]; otherwise only symbol is written
 * and it is set to SAINT_MIN, which the sort step treats as "nothing to induce".
 */
static void libsais64_final_sorting_scan_right_to_left_32s_block_gather(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 1; i < j; i += 2)
    {
        libsais64_prefetchw(&SA[i + 2 * prefetch_distance]);

        sa_sint_t s0 = SA[i + prefetch_distance + 0]; const sa_sint_t * Ts0 = &T[s0 > 0 ? s0 : 2]; libsais64_prefetchr(Ts0 - 1); libsais64_prefetchr(Ts0 - 2);
        sa_sint_t s1 = SA[i + prefetch_distance + 1]; const sa_sint_t * Ts1 = &T[s1 > 0 ? s1 : 2]; libsais64_prefetchr(Ts1 - 1); libsais64_prefetchr(Ts1 - 2);

        libsais64_prefetchw(&cache[i + prefetch_distance]);

        sa_sint_t symbol0 = SAINT_MIN, p0 = SA[i + 0]; SA[i + 0] = p0 & SAINT_MAX; if (p0 > 0) { p0--; cache[i + 0].index = p0 | (sa_sint_t)((sa_uint_t)(T[p0 - (p0 > 0)] > T[p0]) << (SAINT_BIT - 1)); symbol0 = T[p0]; } cache[i + 0].symbol = symbol0;
        sa_sint_t symbol1 = SAINT_MIN, p1 = SA[i + 1]; SA[i + 1] = p1 & SAINT_MAX; if (p1 > 0) { p1--; cache[i + 1].index = p1 | (sa_sint_t)((sa_uint_t)(T[p1 - (p1 > 0)] > T[p1]) << (SAINT_BIT - 1)); symbol1 = T[p1]; } cache[i + 1].symbol = symbol1;
    }

    /* Restore j to the end of the block and drain the tail without prefetching. */
    for (j += prefetch_distance + 1; i < j; i += 1)
    {
        sa_sint_t symbol = SAINT_MIN, p = SA[i]; SA[i] = p & SAINT_MAX; if (p > 0) { p--; cache[i].index = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); symbol = T[p]; } cache[i].symbol = symbol;
    }
}

/*
 * "Sort" step of the blocked right-to-left final sorting scan (integer text).
 *
 * Walks the gathered cache slots from the end of the block to its start. For a
 * slot whose symbol is >= 0 the symbol field is replaced by the destination
 * position --induction_bucket[symbol]. When that destination is not below
 * omp_block_start, the destination's own cache slot is filled in right away:
 * the current slot's index np is masked with SAINT_MAX in place and, if
 * np > 0, cache[destination] receives index = (np - 1) with bit
 * (SAINT_BIT - 1) set when T[np - 2] > T[np - 1], and symbol = T[np - 1].
 * Slots with a negative symbol are left untouched.
 */
static void libsais64_final_sorting_scan_right_to_left_32s_block_sort(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT induction_bucket, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    fast_sint_t i, j;
    for (i = omp_block_start + omp_block_size - 1, j = omp_block_start + prefetch_distance + 1; i >= j; i -= 2)
    {
        libsais64_prefetchw(&cache[i - 2 * prefetch_distance]);

        /* Symbols that are not positive fall back to bucket 0 for the prefetch address. */
        sa_sint_t s0 = cache[i - prefetch_distance - 0].symbol; const sa_sint_t * Is0 = &induction_bucket[s0 > 0 ? s0 : 0]; libsais64_prefetchw(Is0);
        sa_sint_t s1 = cache[i - prefetch_distance - 1].symbol; const sa_sint_t * Is1 = &induction_bucket[s1 > 0 ?
s1 : 0]; libsais64_prefetchw(Is1);

        sa_sint_t v0 = cache[i - 0].symbol;
        if (v0 >= 0)
        {
            /* The symbol field now holds the destination position in SA. */
            cache[i - 0].symbol = --induction_bucket[v0];
            if (cache[i - 0].symbol >= omp_block_start)
            {
                sa_sint_t ni = cache[i - 0].symbol, np = cache[i - 0].index; cache[i - 0].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | (sa_sint_t)((sa_uint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; }
            }
        }

        sa_sint_t v1 = cache[i - 1].symbol;
        if (v1 >= 0)
        {
            cache[i - 1].symbol = --induction_bucket[v1];
            if (cache[i - 1].symbol >= omp_block_start)
            {
                sa_sint_t ni = cache[i - 1].symbol, np = cache[i - 1].index; cache[i - 1].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | (sa_sint_t)((sa_uint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; }
            }
        }
    }

    /* Restore j to omp_block_start and drain the tail without prefetching. */
    for (j -= prefetch_distance + 1; i >= j; i -= 1)
    {
        sa_sint_t v = cache[i].symbol;
        if (v >= 0)
        {
            cache[i].symbol = --induction_bucket[v];
            if (cache[i].symbol >= omp_block_start)
            {
                sa_sint_t ni = cache[i].symbol, np = cache[i].index; cache[i].index = np & SAINT_MAX; if (np > 0) { np--; cache[ni].index = np | (sa_sint_t)((sa_uint_t)(T[np - (np > 0)] > T[np]) << (SAINT_BIT - 1)); cache[ni].symbol = T[np]; }
            }
        }
    }
}

/*
 * Processes one block SA[block_start .. block_start + block_size - 1] of the
 * right-to-left BWT scan, in parallel when possible.
 *
 * The parallel region is only entered when threads > 1, block_size is at least
 * 64 * max(k, 256) and omp_get_dynamic() reports 0. Each thread gets a slice
 * whose length is block_size / omp_num_threads rounded down to a multiple of
 * 16; the last thread also takes the remainder.
 *
 * With a single thread the serial libsais64_final_bwt_scan_right_to_left_8u is
 * run on the whole block (its return value is not used here). Otherwise:
 *   1. every thread runs the prepare step into its own buckets and cache;
 *   2. after a barrier the master thread walks the threads from last to first
 *      and, per symbol, hands each thread the current induction_bucket value
 *      while lowering induction_bucket by that thread's count;
 *   3. after a second barrier every thread places its cached records into SA.
 */
static void libsais64_final_bwt_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ? k : 256) && omp_get_dynamic() == 0)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(k); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* "& (-16)" rounds the per-thread stride down to a multiple of 16. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            libsais64_final_bwt_scan_right_to_left_8u(T, SA, induction_bucket, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                thread_state[omp_thread_num].state.count = libsais64_final_bwt_scan_right_to_left_8u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            #pragma omp master
            {
                /* Last thread first: it owns the rightmost slice and so the highest bucket positions. */
                fast_sint_t t;
                for (t = omp_num_threads - 1; t >= 0; --t)
                {
                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
                    fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; }
                }
            }

            #pragma omp barrier

            {
                libsais64_final_order_scan_right_to_left_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
            }
        }
#endif
    }
}

/*
 * Processes one block of the right-to-left BWT scan with auxiliary indexes.
 *
 * Same slicing, parallel-region condition and prepare / merge / place sequence
 * as libsais64_final_bwt_scan_right_to_left_8u_block_omp, using the bwt_aux
 * prepare and place helpers; rm and I are forwarded to the place step (or to
 * the serial scan when only one thread runs).
 */
static void libsais64_final_bwt_aux_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ? k : 256) && omp_get_dynamic() == 0)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(k); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            libsais64_final_bwt_aux_scan_right_to_left_8u(T, SA, rm, I, induction_bucket, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                thread_state[omp_thread_num].state.count = libsais64_final_bwt_aux_scan_right_to_left_8u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            #pragma omp master
            {
                /* Turn per-thread symbol counts into per-thread bucket positions, last thread first. */
                fast_sint_t t;
                for (t = omp_num_threads - 1; t >= 0; --t)
                {
                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
                    fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; }
                }
            }

            #pragma omp barrier

            {
                libsais64_final_bwt_aux_scan_right_to_left_8u_block_place(SA, rm, I, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
            }
        }
#endif
    }
}

/*
 * Processes one block of the right-to-left final sorting scan (8-bit text).
 *
 * Same slicing, parallel-region condition and prepare / merge / place sequence
 * as libsais64_final_bwt_scan_right_to_left_8u_block_omp, using the sorting
 * prepare helper and the shared final_order place helper.
 */
static void libsais64_final_sorting_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ? k : 256) && omp_get_dynamic() == 0)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(k); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            libsais64_final_sorting_scan_right_to_left_8u(T, SA, induction_bucket, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                thread_state[omp_thread_num].state.count = libsais64_final_sorting_scan_right_to_left_8u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            #pragma omp master
            {
                /* Turn per-thread symbol counts into per-thread bucket positions, last thread first. */
                fast_sint_t t;
                for (t = omp_num_threads - 1; t >= 0; --t)
                {
                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
                    fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; }
                }
            }

            #pragma omp barrier

            {
                libsais64_final_order_scan_right_to_left_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
            }
        }
#endif
    }
}

/*
 * Processes one block of the right-to-left GSA scan (8-bit text).
 *
 * Same slicing, parallel-region condition and merge step as the other 8u block
 * drivers. The prepare step is the one used by the plain sorting scan; the
 * GSA-specific filtering (records with symbol <= 0 are dropped) happens in
 * libsais64_final_gsa_scan_right_to_left_8u_block_place.
 */
static void libsais64_final_gsa_scan_right_to_left_8u_block_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 64 * (k > 256 ? k : 256) && omp_get_dynamic() == 0)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(k); UNUSED(threads); UNUSED(thread_state);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            libsais64_final_gsa_scan_right_to_left_8u(T, SA, induction_bucket, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                thread_state[omp_thread_num].state.count = libsais64_final_sorting_scan_right_to_left_8u_block_prepare(T, SA, k, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            #pragma omp master
            {
                /* Turn per-thread symbol counts into per-thread bucket positions, last thread first. */
                fast_sint_t t;
                for (t = omp_num_threads - 1; t >= 0; --t)
                {
                    sa_sint_t * RESTRICT temp_bucket = thread_state[t].state.buckets;
                    fast_sint_t c; for (c = 0; c < k; c += 1) { sa_sint_t A = induction_bucket[c], B = temp_bucket[c]; induction_bucket[c] = A - B; temp_bucket[c] = A; }
                }
            }

            #pragma omp barrier

            {
                libsais64_final_gsa_scan_right_to_left_8u_block_place(SA, thread_state[omp_thread_num].state.buckets, thread_state[omp_thread_num].state.cache, thread_state[omp_thread_num].state.count);
            }
        }
#endif
    }
}

/*
 * Processes one block SA[block_start .. block_start + block_size - 1] of the
 * right-to-left final sorting scan for an integer text.
 *
 * The parallel region is entered when threads > 1 and block_size >= 16384.
 * With one thread the serial scan handles the block. Otherwise every thread
 * gathers its slice into the shared cache, the master thread alone runs the
 * sort step over the whole block, and finally every thread calls
 * libsais64_compact_and_place_cached_suffixes on its slice. The cache pointer
 * is passed as cache - block_start so helpers can index it by SA position.
 */
static void libsais64_final_sorting_scan_right_to_left_32s_block_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT buckets, LIBSAIS_THREAD_CACHE * RESTRICT cache, fast_sint_t block_start, fast_sint_t block_size, sa_sint_t threads)
{
#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel num_threads(threads) if(threads > 1 && block_size >= 16384)
#endif
    {
#if defined(LIBSAIS_OPENMP)
        fast_sint_t omp_thread_num = omp_get_thread_num();
fast_sint_t omp_num_threads = omp_get_num_threads();
#else
        UNUSED(threads); UNUSED(cache);

        fast_sint_t omp_thread_num = 0;
        fast_sint_t omp_num_threads = 1;
#endif
        /* Per-thread slice: stride rounded down to a multiple of 16, last thread takes the remainder. */
        fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16);
        fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
        fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start;

        omp_block_start += block_start;

        if (omp_num_threads == 1)
        {
            libsais64_final_sorting_scan_right_to_left_32s(T, SA, buckets, omp_block_start, omp_block_size);
        }
#if defined(LIBSAIS_OPENMP)
        else
        {
            {
                libsais64_final_sorting_scan_right_to_left_32s_block_gather(T, SA, cache - block_start, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            #pragma omp master
            {
                /* The sort step runs on the master thread only and covers the whole block. */
                libsais64_final_sorting_scan_right_to_left_32s_block_sort(T, buckets, cache - block_start, block_start, block_size);
            }

            #pragma omp barrier

            {
                libsais64_compact_and_place_cached_suffixes(SA, cache - block_start, omp_block_start, omp_block_size);
            }
        }
#endif
    }
}

#endif

/*
 * Driver for the right-to-left BWT scan over SA[0 .. n - 1].
 *
 * When threads == 1 or n < 65536 the serial scan handles everything and its
 * return value is returned. Otherwise (OpenMP builds only) SA is walked from
 * the right: an entry equal to 0 is skipped and its position remembered in
 * index; any other entry starts a block that extends left until a zero entry
 * or the maximum block length
 * threads * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads) is reached. Blocks
 * shorter than 32 entries are processed inline with the serial per-entry
 * logic, longer ones go through the block_omp helper.
 *
 * Returns the remembered position of the most recently visited zero entry, or
 * -1 if none was seen on the blocked path.
 * NOTE(review): without LIBSAIS_OPENMP the else-branch is compiled out, so a
 * call with threads != 1 and n >= 65536 would do no work -- presumably callers
 * force threads to 1 in that configuration; confirm.
 */
static sa_sint_t libsais64_final_bwt_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    sa_sint_t index = -1;

    if (threads == 1 || n < 65536)
    {
        index = libsais64_final_bwt_scan_right_to_left_8u(T, SA, induction_bucket, 0, n);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start;
        for (block_start = (fast_sint_t)n - 1; block_start >= 0; )
        {
            if (SA[block_start] == 0)
            {
                index = (sa_sint_t)block_start--;
            }
            else
            {
                /* block_end is exclusive: the block is SA[block_end + 1 .. block_start]. */
                fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < 0) { block_max_end = -1; }
                fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
                fast_sint_t block_size = block_start - block_end;

                if (block_size < 32)
                {
                    for (; block_start > block_end; block_start -= 1)
                    {
                        sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[block_start] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; }
                    }
                }
                else
                {
                    libsais64_final_bwt_scan_right_to_left_8u_block_omp(T, SA, k, induction_bucket, block_end + 1, block_size, threads, thread_state);
                    block_start = block_end;
                }
            }
        }
    }
#else
    UNUSED(k); UNUSED(thread_state);
#endif

    return index;
}

/*
 * Driver for the right-to-left BWT scan with auxiliary indexes over
 * SA[0 .. n - 1].
 *
 * Same structure as libsais64_final_bwt_scan_right_to_left_8u_omp, except that
 * nothing is returned, zero entries are simply skipped, and the maximum block
 * length is halved (threads * ((LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads) / 2)),
 * matching the two cache slots per record used by the bwt_aux prepare step.
 * The inline path for short blocks also updates I: when the new position p
 * has no bits in common with rm, I[p / (rm + 1)] receives the destination
 * index plus one.
 */
static void libsais64_final_bwt_aux_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t rm, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    if (threads == 1 || n < 65536)
    {
        libsais64_final_bwt_aux_scan_right_to_left_8u(T, SA, rm, I, induction_bucket, 0, n);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start;
        for (block_start = (fast_sint_t)n - 1; block_start >= 0; )
        {
            if (SA[block_start] == 0)
            {
                block_start--;
            }
            else
            {
                fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * ((LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads) / 2); if (block_max_end < 0) { block_max_end = -1; }
                fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
                fast_sint_t block_size = block_start - block_end;

                if (block_size < 32)
                {
                    for (; block_start > block_end; block_start -= 1)
                    {
                        sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; uint8_t c0 = T[p - (p > 0)], c1 = T[p]; SA[block_start] = c1; sa_sint_t t = c0 | SAINT_MIN; SA[--induction_bucket[c1]] = (c0 <= c1) ? p : t; if ((p & rm) == 0) { I[p / (rm + 1)] = induction_bucket[T[p]] + 1; } }
                    }
                }
                else
                {
                    libsais64_final_bwt_aux_scan_right_to_left_8u_block_omp(T, SA, k, rm, I, induction_bucket, block_end + 1, block_size, threads, thread_state);
                    block_start = block_end;
                }
            }
        }
    }
#else
    UNUSED(k); UNUSED(thread_state);
#endif
}

/*
 * Driver for the right-to-left final sorting scan (8-bit text) over
 * SA[omp_block_start .. omp_block_start + omp_block_size - 1].
 *
 * When threads == 1 or omp_block_size < 65536 the serial scan handles the
 * range. Otherwise (OpenMP builds only) the range is walked from the right,
 * zero entries are skipped, and runs of non-zero entries are cut into blocks
 * of at most threads * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * threads) entries
 * that never extend below omp_block_start. Blocks shorter than 32 entries are
 * processed inline; longer ones go through the block_omp helper.
 */
static void libsais64_final_sorting_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    if (threads == 1 || omp_block_size < 65536)
    {
        libsais64_final_sorting_scan_right_to_left_8u(T, SA, induction_bucket, omp_block_start, omp_block_size);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start;
        for (block_start = omp_block_start + omp_block_size - 1; block_start >= omp_block_start; )
        {
            if (SA[block_start] == 0)
            {
                block_start--;
            }
            else
            {
                fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < omp_block_start) { block_max_end = omp_block_start - 1; }
                fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
                fast_sint_t block_size = block_start - block_end;

                if (block_size < 32)
                {
                    for (; block_start > block_end; block_start -= 1)
                    {
                        sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0) { p--; SA[--induction_bucket[T[p]]] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); }
                    }
                }
                else
                {
                    libsais64_final_sorting_scan_right_to_left_8u_block_omp(T, SA, k, induction_bucket, block_end + 1, block_size, threads, thread_state);
                    block_start = block_end;
                }
            }
        }
    }
#else
    UNUSED(k); UNUSED(thread_state);
#endif
}

/*
 * Driver for the right-to-left GSA scan (8-bit text) over
 * SA[omp_block_start .. omp_block_start + omp_block_size - 1].
 *
 * Same blocking scheme as libsais64_final_sorting_scan_right_to_left_8u_omp.
 * The inline path for short blocks only induces from an entry p when, in
 * addition to p > 0, the preceding symbol T[p - 1] is greater than 0.
 */
static void libsais64_final_gsa_scan_right_to_left_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, fast_sint_t omp_block_start, fast_sint_t omp_block_size, sa_sint_t k, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    if (threads == 1 || omp_block_size < 65536)
    {
        libsais64_final_gsa_scan_right_to_left_8u(T, SA, induction_bucket, omp_block_start, omp_block_size);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start;
        for (block_start = omp_block_start + omp_block_size - 1; block_start >= omp_block_start; )
        {
            if (SA[block_start] == 0)
            {
                block_start--;
            }
            else
            {
                fast_sint_t block_max_end = block_start - ((fast_sint_t)threads) * (LIBSAIS_PER_THREAD_CACHE_SIZE - 16 * (fast_sint_t)threads); if (block_max_end < omp_block_start) { block_max_end = omp_block_start - 1; }
                fast_sint_t block_end = block_start - 1; while (block_end > block_max_end && SA[block_end] != 0) { block_end--; }
                fast_sint_t block_size = block_start - block_end;

                if (block_size < 32)
                {
                    for (; block_start > block_end; block_start -= 1)
                    {
                        sa_sint_t p = SA[block_start]; SA[block_start] = p & SAINT_MAX; if (p > 0 && T[p - 1] > 0) { p--; SA[--induction_bucket[T[p]]] = p | (sa_sint_t)((sa_uint_t)(T[p - (p > 0)] > T[p]) << (SAINT_BIT - 1)); }
                    }
                }
                else
                {
                    libsais64_final_gsa_scan_right_to_left_8u_block_omp(T, SA, k, induction_bucket, block_end + 1, block_size, threads, thread_state);
                    block_start = block_end;
                }
            }
        }
    }
#else
    UNUSED(k); UNUSED(thread_state);
#endif
}

/*
 * Driver for the right-to-left final sorting scan (integer text) over
 * SA[0 .. n - 1].
 *
 * When threads == 1 or n < 65536 the serial scan handles everything.
 * Otherwise (OpenMP builds only) SA is cut, from the right, into consecutive
 * blocks of at most threads * LIBSAIS_PER_THREAD_CACHE_SIZE entries, each
 * handed to the 32s block_omp helper together with the cache of
 * thread_state[0].
 */
static void libsais64_final_sorting_scan_right_to_left_32s_omp(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t * RESTRICT induction_bucket, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    if (threads == 1 || n < 65536)
    {
        libsais64_final_sorting_scan_right_to_left_32s(T, SA, induction_bucket, 0, n);
    }
#if defined(LIBSAIS_OPENMP)
    else
    {
        fast_sint_t block_start, block_end;
        for (block_start = (fast_sint_t)n - 1; block_start >= 0; block_start = block_end)
        {
            /* block_end is exclusive: the block is SA[block_end + 1 .. block_start]. */
            block_end = block_start - (fast_sint_t)threads * LIBSAIS_PER_THREAD_CACHE_SIZE; if (block_end < 0) { block_end = -1; }

            libsais64_final_sorting_scan_right_to_left_32s_block_omp(T, SA, induction_bucket, thread_state[0].state.cache, block_end + 1, block_start - block_end, threads);
        }
    }
#else
    UNUSED(thread_state);
#endif
}

/*
 * Zeroes SA[bucket_start[c] .. bucket_end[c] - 1] for every symbol c in
 * [0, k) whose range is non-empty. In OpenMP builds the symbols are spread
 * over threads with a static schedule of chunk size 1 when threads > 1 and
 * n >= 65536; n is otherwise unused.
 */
static void libsais64_clear_lms_suffixes_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT bucket_start, sa_sint_t * RESTRICT bucket_end, sa_sint_t threads)
{
    fast_sint_t c;

#if defined(LIBSAIS_OPENMP)
    #pragma omp parallel for schedule(static, 1) num_threads(threads) if(threads > 1 && n >= 65536)
#else
    UNUSED(threads); UNUSED(n);
#endif
    for (c = 0; c < k; ++c)
    {
        if (bucket_end[c] > bucket_start[c])
        {
            memset(&SA[bucket_start[c]], 0, ((size_t)bucket_end[c] - (size_t)bucket_start[c]) * sizeof(sa_sint_t));
        }
    }
}

/*
 * Runs the final induction passes for an 8-bit text and picks the variant
 * from flags and I:
 *   - LIBSAIS_FLAGS_BWT clear: final sorting scans; with LIBSAIS_FLAGS_GSA set
 *     the first bucket start is primed from buckets[7 * ALPHABET_SIZE], the
 *     suffix markers are flipped and the GSA right-to-left scan is run on the
 *     part of SA starting at buckets[7 * ALPHABET_SIZE]. Returns 0.
 *   - LIBSAIS_FLAGS_BWT set and I != NULL: BWT scans with auxiliary indexes,
 *     passing r - 1 as the mask rm. Returns 0.
 *   - LIBSAIS_FLAGS_BWT set and I == NULL: plain BWT scans. Returns the value
 *     produced by the right-to-left BWT scan.
 *
 * In every variant the left-to-right pass uses &buckets[6 * ALPHABET_SIZE] and
 * the right-to-left pass uses &buckets[7 * ALPHABET_SIZE]; between the two,
 * libsais64_clear_lms_suffixes_omp is called when threads > 1 and n >= 65536.
 */
static sa_sint_t libsais64_induce_final_order_8u_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t flags, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    if ((flags & LIBSAIS_FLAGS_BWT) == 0)
    {
        if (flags & LIBSAIS_FLAGS_GSA) { buckets[6 * ALPHABET_SIZE] = buckets[7 * ALPHABET_SIZE] - 1; }

        libsais64_final_sorting_scan_left_to_right_8u_omp(T, SA, n, k, &buckets[6 * ALPHABET_SIZE], threads, thread_state);
        if (threads > 1 && n >= 65536) { libsais64_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); }

        if (flags & LIBSAIS_FLAGS_GSA)
        {
            libsais64_flip_suffix_markers_omp(SA, buckets[7 * ALPHABET_SIZE], threads);
            libsais64_final_gsa_scan_right_to_left_8u_omp(T, SA, buckets[7 * ALPHABET_SIZE], (fast_sint_t)n - buckets[7 * ALPHABET_SIZE], k, &buckets[7 * ALPHABET_SIZE], threads, thread_state);
        }
        else
        {
            libsais64_final_sorting_scan_right_to_left_8u_omp(T, SA, 0, n, k, &buckets[7 * ALPHABET_SIZE], threads, thread_state);
        }

        return 0;
    }
    else if (I != NULL)
    {
        libsais64_final_bwt_aux_scan_left_to_right_8u_omp(T, SA, n, k, r - 1, I, &buckets[6 * ALPHABET_SIZE],
threads, thread_state);
        if (threads > 1 && n >= 65536) { libsais64_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); }
        libsais64_final_bwt_aux_scan_right_to_left_8u_omp(T, SA, n, k, r - 1, I, &buckets[7 * ALPHABET_SIZE], threads, thread_state);

        return 0;
    }
    else
    {
        /* Plain BWT: the right-to-left scan's return value is the function result. */
        libsais64_final_bwt_scan_left_to_right_8u_omp(T, SA, n, k, &buckets[6 * ALPHABET_SIZE], threads, thread_state);
        if (threads > 1 && n >= 65536) { libsais64_clear_lms_suffixes_omp(SA, n, ALPHABET_SIZE, &buckets[6 * ALPHABET_SIZE], &buckets[7 * ALPHABET_SIZE], threads); }
        return libsais64_final_bwt_scan_right_to_left_8u_omp(T, SA, n, k, &buckets[7 * ALPHABET_SIZE], threads, thread_state);
    }
}

/*
 * Final induction for an integer text with the 6k bucket layout: the
 * left-to-right pass uses the table at buckets[4 * k], the right-to-left pass
 * the table at buckets[5 * k].
 */
static void libsais64_induce_final_order_32s_6k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    libsais64_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[4 * (fast_sint_t)k], threads, thread_state);
    libsais64_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[5 * (fast_sint_t)k], threads, thread_state);
}

/*
 * Final induction for an integer text with the 4k bucket layout: the
 * left-to-right pass uses the table at buckets[2 * k], the right-to-left pass
 * the table at buckets[3 * k].
 */
static void libsais64_induce_final_order_32s_4k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    libsais64_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[2 * (fast_sint_t)k], threads, thread_state);
    libsais64_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[3 * (fast_sint_t)k], threads, thread_state);
}

/*
 * Final induction for an integer text with the 2k bucket layout: the
 * left-to-right pass uses the table at buckets[1 * k], the right-to-left pass
 * the table at buckets[0 * k].
 */
static void libsais64_induce_final_order_32s_2k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    libsais64_final_sorting_scan_left_to_right_32s_omp(T, SA, n, &buckets[1 * (fast_sint_t)k], threads, thread_state);
    libsais64_final_sorting_scan_right_to_left_32s_omp(T, SA, n, &buckets[0 * (fast_sint_t)k], threads, thread_state);
}

/*
 * Final induction for an integer text with a single k-entry bucket table.
 * Because both passes share the table, it is rebuilt before each of them:
 * symbol counts are recomputed, then converted by the start initializer for
 * the left-to-right pass and by the end initializer for the right-to-left pass.
 */
static void libsais64_induce_final_order_32s_1k(const sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t * RESTRICT buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state)
{
    libsais64_count_suffixes_32s(T, n, k, buckets);
    libsais64_initialize_buckets_start_32s_1k(k, buckets);
    libsais64_final_sorting_scan_left_to_right_32s_omp(T, SA, n, buckets, threads, thread_state);

    libsais64_count_suffixes_32s(T, n, k, buckets);
    libsais64_initialize_buckets_end_32s_1k(k, buckets);
    libsais64_final_sorting_scan_right_to_left_32s_omp(T, SA, n, buckets, threads, thread_state);
}

/*
 * Renumbering pass over SA[omp_block_start .. omp_block_start + omp_block_size - 1].
 *
 * SAm is the part of SA starting at index m. For each entry p = SA[i] the
 * value s = SAm[p >> 1] is examined:
 *   - s < 0:  T[p] gets SAINT_MIN or-ed in, the running counter f is
 *             incremented and SAm[p >> 1] becomes i + SAINT_MIN;
 *   - s >= 0: SAm[p >> 1] becomes s - f.
 * f is taken as the initial counter value and the updated counter is returned.
 *
 * The first loop handles four entries per iteration with prefetching up to
 * 3 * prefetch_distance ahead; the second loop finishes the block.
 */
static sa_sint_t libsais64_renumber_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t f, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    const fast_sint_t prefetch_distance = 64;

    sa_sint_t * RESTRICT SAm = &SA[m];

    sa_sint_t i, j;
    for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 2 * (sa_sint_t)prefetch_distance - 3; i < j; i += 4)
    {
        libsais64_prefetchr(&SA[i + 3 * prefetch_distance]);

        libsais64_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 0]) >> 1]);
        libsais64_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 1]) >> 1]);
        libsais64_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 2]) >> 1]);
        libsais64_prefetchw(&SAm[((sa_uint_t)SA[i + 2 * prefetch_distance + 3]) >> 1]);

        /* Prefetch T[q] only when the entry will take the s < 0 branch; otherwise re-touch SAm. */
        sa_uint_t q0 = (sa_uint_t)SA[i + prefetch_distance + 0]; sa_sint_t * RESTRICT Tq0 = &T[q0]; libsais64_prefetchw(SAm[q0 >> 1] < 0 ? Tq0 : &SAm[q0 >> 1]);
        sa_uint_t q1 = (sa_uint_t)SA[i + prefetch_distance + 1]; sa_sint_t * RESTRICT Tq1 = &T[q1]; libsais64_prefetchw(SAm[q1 >> 1] < 0 ?
Tq1 : &SAm[q1 >> 1]); sa_uint_t q2 = (sa_uint_t)SA[i + prefetch_distance + 2]; sa_sint_t * RESTRICT Tq2 = &T[q2]; libsais64_prefetchw(SAm[q2 >> 1] < 0 ? Tq2 : &SAm[q2 >> 1]); sa_uint_t q3 = (sa_uint_t)SA[i + prefetch_distance + 3]; sa_sint_t * RESTRICT Tq3 = &T[q3]; libsais64_prefetchw(SAm[q3 >> 1] < 0 ? Tq3 : &SAm[q3 >> 1]); sa_uint_t p0 = (sa_uint_t)SA[i + 0]; sa_sint_t s0 = SAm[p0 >> 1]; if (s0 < 0) { T[p0] |= SAINT_MIN; f++; s0 = i + 0 + SAINT_MIN + f; } SAm[p0 >> 1] = s0 - f; sa_uint_t p1 = (sa_uint_t)SA[i + 1]; sa_sint_t s1 = SAm[p1 >> 1]; if (s1 < 0) { T[p1] |= SAINT_MIN; f++; s1 = i + 1 + SAINT_MIN + f; } SAm[p1 >> 1] = s1 - f; sa_uint_t p2 = (sa_uint_t)SA[i + 2]; sa_sint_t s2 = SAm[p2 >> 1]; if (s2 < 0) { T[p2] |= SAINT_MIN; f++; s2 = i + 2 + SAINT_MIN + f; } SAm[p2 >> 1] = s2 - f; sa_uint_t p3 = (sa_uint_t)SA[i + 3]; sa_sint_t s3 = SAm[p3 >> 1]; if (s3 < 0) { T[p3] |= SAINT_MIN; f++; s3 = i + 3 + SAINT_MIN + f; } SAm[p3 >> 1] = s3 - f; }
/* Scalar tail: restore j to the true end of the block and finish the entries the unrolled loop left over (no prefetching here). */
for (j += 2 * (sa_sint_t)prefetch_distance + 3; i < j; i += 1) { sa_uint_t p = (sa_uint_t)SA[i]; sa_sint_t s = SAm[p >> 1]; if (s < 0) { T[p] |= SAINT_MIN; f++; s = i + SAINT_MIN + f; } SAm[p >> 1] = s - f; } return f; }

/*
 * Splits the entries SA[m + omp_block_start .. m + omp_block_start + omp_block_size)
 * into two output streams, scanning the block from its highest index down.
 *
 *   - entries with the sign bit set are written, sign bit cleared, at SA[l],
 *     with l moving downwards from *pl - 1;
 *   - entries that are strictly positive are written, decremented by one, at
 *     SA[r], with r moving downwards from *pr - 1;
 *   - zero entries advance neither cursor and are therefore dropped.
 *
 * Both stores are executed unconditionally for every entry; only the cursor
 * decrement is conditional, so a rejected value is simply overwritten by the
 * next store to the same slot (branch-free compaction). This also means the
 * slot just below the last accepted element of each stream may be scribbled.
 *
 * On return *pl and *pr hold the lowest index written by the respective
 * stream (i.e. the start of the compacted run).
 */
static void libsais64_compact_unique_and_nonunique_lms_suffixes_32s(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t * pl, fast_sint_t * pr, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; sa_uint_t * RESTRICT SAl = (sa_uint_t *)&SA[0]; sa_uint_t * RESTRICT SAr = (sa_uint_t *)&SA[0]; fast_sint_t i, j, l = *pl - 1, r = *pr - 1; for (i = (fast_sint_t)m + omp_block_start + omp_block_size - 1, j = (fast_sint_t)m + omp_block_start + 3; i >= j; i -= 4) { libsais64_prefetchr(&SA[i - prefetch_distance]); sa_uint_t p0 = (sa_uint_t)SA[i - 0]; SAl[l] = p0 & SAINT_MAX; l -= (sa_sint_t)p0 < 0; SAr[r] = p0 - 1; r -= (sa_sint_t)p0 > 0; sa_uint_t p1 = (sa_uint_t)SA[i - 1]; SAl[l] = p1 & SAINT_MAX; l -= (sa_sint_t)p1 < 0; SAr[r] = p1 - 1; r -= (sa_sint_t)p1 > 0;
sa_uint_t p2 = (sa_uint_t)SA[i - 2]; SAl[l] = p2 & SAINT_MAX; l -= (sa_sint_t)p2 < 0; SAr[r] = p2 - 1; r -= (sa_sint_t)p2 > 0; sa_uint_t p3 = (sa_uint_t)SA[i - 3]; SAl[l] = p3 & SAINT_MAX; l -= (sa_sint_t)p3 < 0; SAr[r] = p3 - 1; r -= (sa_sint_t)p3 > 0; }
/* Scalar tail for the (up to three) lowest entries of the block. */
for (j -= 3; i >= j; i -= 1) { sa_uint_t p = (sa_uint_t)SA[i]; SAl[l] = p & SAINT_MAX; l -= (sa_sint_t)p < 0; SAr[r] = p - 1; r -= (sa_sint_t)p > 0; } *pl = l + 1; *pr = r + 1; }

#if defined(LIBSAIS_OPENMP)
/*
 * Counts, for the entries SA[omp_block_start .. omp_block_start + omp_block_size),
 * how many satisfy SAm[(sa_uint_t)SA[i] >> 1] < 0 where SAm = &SA[m].
 * This is the same test that makes the renumbering helper increment f, so
 * the result is the number of increments that helper will perform on the
 * same block. Read-only; four independent accumulators are used in the
 * unrolled loop and summed at the end.
 */
static sa_sint_t libsais64_count_unique_suffixes(sa_sint_t * RESTRICT SA, sa_sint_t m, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; sa_sint_t * RESTRICT SAm = &SA[m]; fast_sint_t i, j; sa_sint_t f0 = 0, f1 = 0, f2 = 0, f3 = 0; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) { libsais64_prefetchr(&SA[i + 2 * prefetch_distance]); libsais64_prefetchr(&SAm[((sa_uint_t)SA[i + prefetch_distance + 0]) >> 1]); libsais64_prefetchr(&SAm[((sa_uint_t)SA[i + prefetch_distance + 1]) >> 1]); libsais64_prefetchr(&SAm[((sa_uint_t)SA[i + prefetch_distance + 2]) >> 1]); libsais64_prefetchr(&SAm[((sa_uint_t)SA[i + prefetch_distance + 3]) >> 1]); f0 += SAm[((sa_uint_t)SA[i + 0]) >> 1] < 0; f1 += SAm[((sa_uint_t)SA[i + 1]) >> 1] < 0; f2 += SAm[((sa_uint_t)SA[i + 2]) >> 1] < 0; f3 += SAm[((sa_uint_t)SA[i + 3]) >> 1] < 0; } for (j += prefetch_distance + 3; i < j; i += 1) { f0 += SAm[((sa_uint_t)SA[i]) >> 1] < 0; } return f0 + f1 + f2 + f3; }
#endif

/*
 * Driver for libsais64_renumber_unique_and_nonunique_lms_suffixes_32s over
 * SA[0 .. m). Returns the total number of entries for which the helper
 * incremented its counter.
 *
 * The range is cut into per-thread stripes whose length is a multiple of 16
 * (the last thread takes the remainder). With one thread the helper runs
 * directly starting from f = 0. With several threads each thread first
 * counts its own stripe, then - after a barrier - starts the helper from the
 * sum of the counts of all lower-numbered threads, so the numbering is the
 * same as in a sequential run. The parallel region is only enabled when
 * threads > 1 and m >= 65536.
 */
static sa_sint_t libsais64_renumber_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { sa_sint_t f = 0;
#if defined(LIBSAIS_OPENMP)
#pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
#endif
{
#if defined(LIBSAIS_OPENMP)
fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads();
#else
UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1;
#endif
fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : m - omp_block_start; if (omp_num_threads == 1) { f = libsais64_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, 0, omp_block_start, omp_block_size); }
#if defined(LIBSAIS_OPENMP)
else { { thread_state[omp_thread_num].state.count = libsais64_count_unique_suffixes(SA, m, omp_block_start, omp_block_size); }
#pragma omp barrier
{ /* count = sum of the counts of all lower-numbered threads; only the last thread publishes the grand total into f. */ fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } if (omp_thread_num == omp_num_threads - 1) { f = (sa_sint_t)(count + thread_state[omp_thread_num].state.count); } libsais64_renumber_unique_and_nonunique_lms_suffixes_32s(T, SA, m, (sa_sint_t)count, omp_block_start, omp_block_size); } }
#endif
} return f; }

/*
 * Driver for libsais64_compact_unique_and_nonunique_lms_suffixes_32s over the
 * n >> 1 entries starting at SA[m].
 *
 * Single-threaded: the sign-marked stream is compacted so that it ends just
 * below SA[m] and the positive stream so that it ends just below SA[n + fs].
 *
 * Multi-threaded (only when threads > 1, n >= 131072 and m < fs): every
 * thread compacts its own 16-aligned stripe into private staging positions
 * (sign-marked stream ending at m + (n >> 1) + stripe_end, positive stream
 * ending at m + stripe_end, i.e. in place inside its own stripe). After a
 * barrier the master thread walks the threads from last to first and copies
 * each thread's two runs to their final, contiguous locations (ending at
 * SA[m] and SA[n + fs] respectively).
 *
 * Finally the f entries SA[m - f .. m) are copied to SA[n + fs - m ..).
 */
static void libsais64_compact_unique_and_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
#if defined(LIBSAIS_OPENMP)
#pragma omp parallel num_threads(threads) if(threads > 1 && n >= 131072 && m < fs)
#endif
{
#if defined(LIBSAIS_OPENMP)
fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads();
#else
UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1;
#endif
fast_sint_t omp_block_stride = (((fast_sint_t)n >> 1) / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : ((fast_sint_t)n >> 1) - omp_block_start; if (omp_num_threads == 1) { fast_sint_t l = m, r = (fast_sint_t)n + (fast_sint_t)fs; libsais64_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &l, &r, omp_block_start, omp_block_size); }
#if defined(LIBSAIS_OPENMP)
else { { /* state.position doubles as the "l" cursor and state.count as the "r" cursor of the helper. */ thread_state[omp_thread_num].state.position = (fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_start + omp_block_size; thread_state[omp_thread_num].state.count = (fast_sint_t)m + omp_block_start + omp_block_size; libsais64_compact_unique_and_nonunique_lms_suffixes_32s(SA, m, &thread_state[omp_thread_num].state.position, &thread_state[omp_thread_num].state.count, omp_block_start, omp_block_size); }
#pragma omp barrier
#pragma omp master
{ fast_sint_t t, position; /* Gather the sign-marked runs so that they end at SA[m]. */ for (position = m, t = omp_num_threads - 1; t >= 0; --t) { fast_sint_t omp_block_end = t < omp_num_threads - 1 ? omp_block_stride * (t + 1) : ((fast_sint_t)n >> 1); fast_sint_t count = ((fast_sint_t)m + ((fast_sint_t)n >> 1) + omp_block_end - thread_state[t].state.position); if (count > 0) { position -= count; memcpy(&SA[position], &SA[thread_state[t].state.position], (size_t)count * sizeof(sa_sint_t)); } } /* Gather the positive runs so that they end at SA[n + fs]. */ for (position = (fast_sint_t)n + (fast_sint_t)fs, t = omp_num_threads - 1; t >= 0; --t) { fast_sint_t omp_block_end = t < omp_num_threads - 1 ?
omp_block_stride * (t + 1) : ((fast_sint_t)n >> 1); fast_sint_t count = ((fast_sint_t)m + omp_block_end - thread_state[t].state.count); if (count > 0) { position -= count; memcpy(&SA[position], &SA[thread_state[t].state.count], (size_t)count * sizeof(sa_sint_t)); } } } }
#endif
} memcpy(&SA[(fast_sint_t)n + (fast_sint_t)fs - (fast_sint_t)m], &SA[(fast_sint_t)m - (fast_sint_t)f], (size_t)f * sizeof(sa_sint_t)); }

/*
 * Runs the renumbering pass followed by the compaction pass and returns f,
 * the count produced by the renumbering pass (it is also the number of
 * entries the compaction pass relocates with its final memcpy).
 */
static sa_sint_t libsais64_compact_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { sa_sint_t f = libsais64_renumber_unique_and_nonunique_lms_suffixes_32s_omp(T, SA, m, threads, thread_state); libsais64_compact_unique_and_nonunique_lms_suffixes_32s_omp(SA, n, m, fs, f, threads, thread_state); return f; }

/*
 * Walks T[omp_block_start .. omp_block_start + omp_block_size). For every
 * symbol whose sign bit is set, the bit is cleared, the text position i is
 * stored at SA[tmp] and the following text position is skipped. The target
 * indices tmp are consumed in order from SA[n - m - 1 + l], SA[n - m + l], ...
 * (l is the number of marked symbols preceding this block).
 *
 * The next target index is always fetched one step ahead, so one element
 * past the last index actually used is read as well.
 */
static void libsais64_merge_unique_lms_suffixes_32s(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l]; sa_sint_t i, j; fast_sint_t tmp = *SAnm++; /* j leaves a margin of 6 because i can advance by up to 8 per unrolled iteration (4 steps + 4 skips). */ for (i = (sa_sint_t)omp_block_start, j = (sa_sint_t)omp_block_start + (sa_sint_t)omp_block_size - 6; i < j; i += 4) { libsais64_prefetchr(&T[i + prefetch_distance]); sa_sint_t c0 = T[i + 0]; if (c0 < 0) { T[i + 0] = c0 & SAINT_MAX; SA[tmp] = i + 0; i++; tmp = *SAnm++; } sa_sint_t c1 = T[i + 1]; if (c1 < 0) { T[i + 1] = c1 & SAINT_MAX; SA[tmp] = i + 1; i++; tmp = *SAnm++; } sa_sint_t c2 = T[i + 2]; if (c2 < 0) { T[i + 2] = c2 & SAINT_MAX; SA[tmp] = i + 2; i++; tmp = *SAnm++; } sa_sint_t c3 = T[i + 3]; if (c3 < 0) { T[i + 3] = c3 & SAINT_MAX; SA[tmp] = i + 3; i++; tmp = *SAnm++; } } for (j += 6; i < j; i += 1) { sa_sint_t c = T[i]; if (c < 0) { T[i] = c & SAINT_MAX; SA[tmp] = i; i++; tmp = *SAnm++; } } }

/*
 * Fills the zero entries of SA[omp_block_start .. omp_block_start + omp_block_size)
 * in ascending order with the values read consecutively from
 * SA[n - m - 1 + l], SA[n - m + l], ... ; non-zero entries are left untouched.
 * As in the unique variant the source value is fetched one step ahead.
 */
static void libsais64_merge_nonunique_lms_suffixes_32s(sa_sint_t *
RESTRICT SA, sa_sint_t n, sa_sint_t m, fast_sint_t l, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; const sa_sint_t * RESTRICT SAnm = &SA[(fast_sint_t)n - (fast_sint_t)m - 1 + l]; fast_sint_t i, j; sa_sint_t tmp = *SAnm++; for (i = omp_block_start, j = omp_block_start + omp_block_size - 3; i < j; i += 4) { libsais64_prefetchr(&SA[i + prefetch_distance]); if (SA[i + 0] == 0) { SA[i + 0] = tmp; tmp = *SAnm++; } if (SA[i + 1] == 0) { SA[i + 1] = tmp; tmp = *SAnm++; } if (SA[i + 2] == 0) { SA[i + 2] = tmp; tmp = *SAnm++; } if (SA[i + 3] == 0) { SA[i + 3] = tmp; tmp = *SAnm++; } } for (j += 3; i < j; i += 1) { if (SA[i] == 0) { SA[i] = tmp; tmp = *SAnm++; } } }

/*
 * Driver for libsais64_merge_unique_lms_suffixes_32s over T[0 .. n).
 * T is cut into 16-aligned stripes. With several threads each thread first
 * counts the sign-marked symbols of its stripe; after a barrier the sum of
 * the counts of the lower-numbered threads becomes the starting offset l for
 * its own merge, so every thread consumes its own slice of the source run.
 * The parallel region is enabled only when threads > 1 and n >= 65536.
 */
static void libsais64_merge_unique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
#if defined(LIBSAIS_OPENMP)
#pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
{
#if defined(LIBSAIS_OPENMP)
fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads();
#else
UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1;
#endif
fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : n - omp_block_start; if (omp_num_threads == 1) { libsais64_merge_unique_lms_suffixes_32s(T, SA, n, m, 0, omp_block_start, omp_block_size); }
#if defined(LIBSAIS_OPENMP)
else { { thread_state[omp_thread_num].state.count = libsais64_count_negative_marked_suffixes(T, omp_block_start, omp_block_size); }
#pragma omp barrier
{ fast_sint_t t, count = 0; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } libsais64_merge_unique_lms_suffixes_32s(T, SA, n, m, count, omp_block_start, omp_block_size); } }
#endif
} }

/*
 * Driver for libsais64_merge_nonunique_lms_suffixes_32s over SA[0 .. m).
 * The source offset starts at f (single-threaded) or at f plus the number of
 * zero entries found in the stripes of all lower-numbered threads
 * (multi-threaded, after a counting pass and a barrier). The parallel region
 * is enabled only when threads > 1 and m >= 65536.
 */
static void libsais64_merge_nonunique_lms_suffixes_32s_omp(sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) {
#if defined(LIBSAIS_OPENMP)
#pragma omp parallel num_threads(threads) if(threads > 1 && m >= 65536)
#endif
{
#if defined(LIBSAIS_OPENMP)
fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads();
#else
UNUSED(threads); UNUSED(thread_state); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1;
#endif
fast_sint_t omp_block_stride = (m / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : m - omp_block_start; if (omp_num_threads == 1) { libsais64_merge_nonunique_lms_suffixes_32s(SA, n, m, f, omp_block_start, omp_block_size); }
#if defined(LIBSAIS_OPENMP)
else { { thread_state[omp_thread_num].state.count = libsais64_count_zero_marked_suffixes(SA, omp_block_start, omp_block_size); }
#pragma omp barrier
{ fast_sint_t t, count = f; for (t = 0; t < omp_thread_num; ++t) { count += thread_state[t].state.count; } libsais64_merge_nonunique_lms_suffixes_32s(SA, n, m, count, omp_block_start, omp_block_size); } }
#endif
} }

/*
 * Merges both compacted streams back: first the entries addressed through
 * the sign-marked symbols of T, then the remaining zero slots of SA[0 .. m).
 * The second pass starts reading its source at offset f, i.e. right after
 * the f values consumed by the first pass.
 */
static void libsais64_merge_compacted_lms_suffixes_32s_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t m, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { libsais64_merge_unique_lms_suffixes_32s_omp(T, SA, n, m, threads, thread_state); libsais64_merge_nonunique_lms_suffixes_32s_omp(SA, n, m, f, threads, thread_state); }

/*
 * Post-recursion reconstruction for the 2k/4k/6k bucket layouts.
 *
 * f > 0 (compaction was applied before recursing):
 *   1. move the f entries stored at SA[n + fs - m ..) to SA[n - m - 1 ..)
 *      (memmove: the ranges may overlap);
 *   2. re-count / re-gather the compacted LMS suffixes and reconstruct the
 *      m - f recursively sorted ones;
 *   3. append those m - f entries behind the f moved ones, clear SA[0 .. m)
 *      and merge both groups back into SA[0 .. m).
 * f == 0: plain count-and-gather followed by reconstruction of all m entries.
 *
 * local_buckets is forwarded unchanged to the counting helper.
 */
static void libsais64_reconstruct_compacted_lms_suffixes_32s_2k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t * RESTRICT buckets, sa_sint_t local_buckets, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { if (f > 0) { memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t)); libsais64_count_and_gather_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, local_buckets, threads, thread_state); libsais64_reconstruct_lms_suffixes_omp(SA, n, m - f, threads); memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t)); memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t)); libsais64_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state); } else { libsais64_count_and_gather_lms_suffixes_32s_2k(T, SA, n, k, buckets, 0, n); libsais64_reconstruct_lms_suffixes_omp(SA, n, m, threads); } }

/*
 * Post-recursion reconstruction for the single-table (1k) layout. Same
 * sequence as the 2k variant, but the LMS suffixes are only gathered (no
 * bucket counting is performed here).
 */
static void libsais64_reconstruct_compacted_lms_suffixes_32s_1k_omp(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA,
sa_sint_t n, sa_sint_t m, sa_sint_t fs, sa_sint_t f, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { if (f > 0) { memmove(&SA[n - m - 1], &SA[n + fs - m], (size_t)f * sizeof(sa_sint_t)); libsais64_gather_compacted_lms_suffixes_32s(T, SA, n); libsais64_reconstruct_lms_suffixes_omp(SA, n, m - f, threads); memcpy(&SA[n - m - 1 + f], &SA[0], ((size_t)m - (size_t)f) * sizeof(sa_sint_t)); memset(&SA[0], 0, (size_t)m * sizeof(sa_sint_t)); libsais64_merge_compacted_lms_suffixes_32s_omp(T, SA, n, m, f, threads, thread_state); } else { libsais64_gather_lms_suffixes_32s(T, SA, n); libsais64_reconstruct_lms_suffixes_omp(SA, n, m, threads); } } static void libsais64_convert_32u_to_64u(uint32_t * RESTRICT S, uint64_t * RESTRICT D, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size; i < j; i += 1) { D[i] = (uint64_t)S[i]; } } static void libsais64_convert_inplace_32u_to_64u(uint32_t * V, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { fast_sint_t i, j; for (i = omp_block_start + omp_block_size - 1, j = omp_block_start; i >= j; i -= 1) { #if defined(__LITTLE_ENDIAN__) V[i + i + 0] = V[i]; V[i + i + 1] = 0; #else V[i + i + 0] = 0; V[i + i + 1] = V[i]; #endif } } static void libsais64_convert_inplace_64u_to_32u(uint32_t * V, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size; i < j; i += 1) { #if defined(__LITTLE_ENDIAN__) V[i] = V[i + i + 0]; #else V[i] = V[i + i + 1]; #endif } } static void libsais64_convert_inplace_32u_to_64u_omp(uint32_t * V, sa_sint_t n, sa_sint_t threads) { while (n >= 65536) { fast_sint_t block_size = n >> 1; n -= block_size; #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else 
UNUSED(threads); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (block_size / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : block_size - omp_block_start; libsais64_convert_32u_to_64u(((uint32_t *)(void *)V) + n, ((uint64_t *)(void *)V) + n, omp_block_start, omp_block_size); } } libsais64_convert_inplace_32u_to_64u(V, 0, n); } static sa_sint_t libsais64_main_32s_recursion(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state, sa_sint_t * RESTRICT local_buffer) { fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n); if (n <= INT32_MAX) { sa_sint_t new_fs = (fs + fs + n + n) <= INT32_MAX ? (fs + fs + n) : INT32_MAX - n; if ((new_fs / k >= 6) || (new_fs / k >= 4 && n <= INT32_MAX / 2) || (new_fs / k < 4 && new_fs >= fs)) { libsais64_convert_inplace_64u_to_32u((uint32_t *)(void *)T, 0, n); #if defined(LIBSAIS_OPENMP) sa_sint_t index = libsais_int_omp((int32_t *)T, (int32_t *)SA, (int32_t)n, (int32_t)k, (int32_t)new_fs, (int32_t)threads); #else sa_sint_t index = libsais_int((int32_t *)T, (int32_t *)SA, (int32_t)n, (int32_t)k, (int32_t)new_fs); #endif if (index >= 0) { libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)SA, n, threads); libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)T, n, threads); } return index; } } if (k > 0 && ((fs / k >= 6) || (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 6))) { sa_sint_t alignment = (fs - 1024) / k >= 6 ? (sa_sint_t)1024 : (sa_sint_t)16; sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 6 ? (sa_sint_t *)libsais64_align_up(&SA[n + fs - 6 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 6 * (fast_sint_t)k]; buckets = (LIBSAIS_LOCAL_BUFFER_SIZE > fs) ? 
local_buffer : buckets; sa_sint_t m = libsais64_count_and_gather_lms_suffixes_32s_4k_omp(T, SA, n, k, buckets, buckets == local_buffer, threads, thread_state); if (m > 1) { memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t)); sa_sint_t first_lms_suffix = SA[n - m]; sa_sint_t left_suffixes_count = libsais64_initialize_buckets_for_lms_suffixes_radix_sort_32s_6k(T, k, buckets, first_lms_suffix); libsais64_radix_sort_lms_suffixes_32s_6k_omp(T, SA, n, m, &buckets[4 * (fast_sint_t)k], threads, thread_state); if ((n / 8192) < k) { libsais64_radix_sort_set_markers_32s_6k_omp(SA, k, &buckets[4 * (fast_sint_t)k], threads); } if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); } libsais64_initialize_buckets_for_partial_sorting_32s_6k(T, k, buckets, first_lms_suffix, left_suffixes_count); libsais64_induce_partial_order_32s_6k_omp(T, SA, n, k, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state); sa_sint_t names = (n / 8192) < k ? libsais64_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state) : libsais64_renumber_and_gather_lms_suffixes_omp(SA, n, m, fs, threads, thread_state); if (names < m) { sa_sint_t f = (n / 8192) < k ? 
libsais64_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state) : 0; if (libsais64_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0) { return -2; } libsais64_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, buckets == local_buffer, threads, thread_state); } else { libsais64_count_lms_suffixes_32s_2k(T, n, k, buckets); } libsais64_initialize_buckets_start_and_end_32s_4k(k, buckets); libsais64_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets); libsais64_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state); } else { SA[0] = SA[n - 1]; libsais64_initialize_buckets_start_and_end_32s_6k(k, buckets); libsais64_place_lms_suffixes_histogram_32s_6k(SA, n, k, m, buckets); libsais64_induce_final_order_32s_6k(T, SA, n, k, buckets, threads, thread_state); } return 0; } else if (k > 0 && (n <= SAINT_MAX / 2) && ((fs / k >= 4) || (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 4))) { sa_sint_t alignment = (fs - 1024) / k >= 4 ? (sa_sint_t)1024 : (sa_sint_t)16; sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 4 ? (sa_sint_t *)libsais64_align_up(&SA[n + fs - 4 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 4 * (fast_sint_t)k]; buckets = (LIBSAIS_LOCAL_BUFFER_SIZE > fs) ? 
local_buffer : buckets; sa_sint_t m = libsais64_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, buckets == local_buffer, threads, thread_state); if (m > 1) { libsais64_initialize_buckets_for_radix_and_partial_sorting_32s_4k(T, k, buckets, SA[n - m]); libsais64_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state); libsais64_radix_sort_set_markers_32s_4k_omp(SA, k, &buckets[1], threads); libsais64_place_lms_suffixes_interval_32s_4k(SA, n, k, m - 1, buckets); libsais64_induce_partial_order_32s_4k_omp(T, SA, n, k, buckets, threads, thread_state); sa_sint_t names = libsais64_renumber_and_mark_distinct_lms_suffixes_32s_4k_omp(SA, n, m, threads, thread_state); if (names < m) { sa_sint_t f = libsais64_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); if (libsais64_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0) { return -2; } libsais64_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, buckets == local_buffer, threads, thread_state); } else { libsais64_count_lms_suffixes_32s_2k(T, n, k, buckets); } } else { SA[0] = SA[n - 1]; } libsais64_initialize_buckets_start_and_end_32s_4k(k, buckets); libsais64_place_lms_suffixes_histogram_32s_4k(SA, n, k, m, buckets); libsais64_induce_final_order_32s_4k(T, SA, n, k, buckets, threads, thread_state); return 0; } else if (k > 0 && ((fs / k >= 2) || (LIBSAIS_LOCAL_BUFFER_SIZE / k >= 2))) { sa_sint_t alignment = (fs - 1024) / k >= 2 ? (sa_sint_t)1024 : (sa_sint_t)16; sa_sint_t * RESTRICT buckets = (fs - alignment) / k >= 2 ? (sa_sint_t *)libsais64_align_up(&SA[n + fs - 2 * (fast_sint_t)k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : &SA[n + fs - 2 * (fast_sint_t)k]; buckets = (LIBSAIS_LOCAL_BUFFER_SIZE > fs) ? 
local_buffer : buckets; sa_sint_t m = libsais64_count_and_gather_lms_suffixes_32s_2k_omp(T, SA, n, k, buckets, buckets == local_buffer, threads, thread_state); if (m > 1) { libsais64_initialize_buckets_for_lms_suffixes_radix_sort_32s_2k(T, k, buckets, SA[n - m]); libsais64_radix_sort_lms_suffixes_32s_2k_omp(T, SA, n, m, &buckets[1], threads, thread_state); libsais64_place_lms_suffixes_interval_32s_2k(SA, n, k, m - 1, buckets); libsais64_initialize_buckets_start_and_end_32s_2k(k, buckets); libsais64_induce_partial_order_32s_2k_omp(T, SA, n, k, buckets, threads, thread_state); sa_sint_t names = libsais64_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads); if (names < m) { sa_sint_t f = libsais64_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); if (libsais64_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0) { return -2; } libsais64_reconstruct_compacted_lms_suffixes_32s_2k_omp(T, SA, n, k, m, fs, f, buckets, buckets == local_buffer, threads, thread_state); } else { libsais64_count_lms_suffixes_32s_2k(T, n, k, buckets); } } else { SA[0] = SA[n - 1]; } libsais64_initialize_buckets_end_32s_2k(k, buckets); libsais64_place_lms_suffixes_histogram_32s_2k(SA, n, k, m, buckets); libsais64_initialize_buckets_start_and_end_32s_2k(k, buckets); libsais64_induce_final_order_32s_2k(T, SA, n, k, buckets, threads, thread_state); return 0; } else { sa_sint_t * buffer = fs < k ? (sa_sint_t *)libsais64_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096) : (sa_sint_t *)NULL; sa_sint_t alignment = fs - 1024 >= k ? (sa_sint_t)1024 : (sa_sint_t)16; sa_sint_t * RESTRICT buckets = fs - alignment >= k ? (sa_sint_t *)libsais64_align_up(&SA[n + fs - k - alignment], (size_t)alignment * sizeof(sa_sint_t)) : fs >= k ? 
&SA[n + fs - k] : buffer; if (buckets == NULL) { return -2; } memset(SA, 0, (size_t)n * sizeof(sa_sint_t)); libsais64_count_suffixes_32s(T, n, k, buckets); libsais64_initialize_buckets_end_32s_1k(k, buckets); sa_sint_t m = libsais64_radix_sort_lms_suffixes_32s_1k(T, SA, n, buckets); if (m > 1) { libsais64_induce_partial_order_32s_1k_omp(T, SA, n, k, buckets, threads, thread_state); sa_sint_t names = libsais64_renumber_and_mark_distinct_lms_suffixes_32s_1k_omp(T, SA, n, m, threads); if (names < m) { if (buffer != NULL) { libsais64_free_aligned(buffer); buckets = NULL; } sa_sint_t f = libsais64_compact_lms_suffixes_32s_omp(T, SA, n, m, fs, threads, thread_state); if (libsais64_main_32s_recursion(SA + n + fs - m + f, SA, m - f, names - f, fs + n - 2 * m + f, threads, thread_state, local_buffer) != 0) { return -2; } libsais64_reconstruct_compacted_lms_suffixes_32s_1k_omp(T, SA, n, m, fs, f, threads, thread_state); if (buckets == NULL) { buckets = buffer = (sa_sint_t *)libsais64_alloc_aligned((size_t)k * sizeof(sa_sint_t), 4096); } if (buckets == NULL) { return -2; } } libsais64_count_suffixes_32s(T, n, k, buckets); libsais64_initialize_buckets_end_32s_1k(k, buckets); libsais64_place_lms_suffixes_interval_32s_1k(T, SA, k, m, buckets); } libsais64_induce_final_order_32s_1k(T, SA, n, k, buckets, threads, thread_state); libsais64_free_aligned(buffer); return 0; } } static sa_sint_t libsais64_main_32s_entry(sa_sint_t * RESTRICT T, sa_sint_t * RESTRICT SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads, LIBSAIS_THREAD_STATE * RESTRICT thread_state) { sa_sint_t local_buffer[2 * LIBSAIS_LOCAL_BUFFER_SIZE]; return libsais64_main_32s_recursion(T, SA, n, k, fs, threads, thread_state, local_buffer + LIBSAIS_LOCAL_BUFFER_SIZE); } static sa_sint_t libsais64_main_8u(const uint8_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t * RESTRICT buckets, sa_sint_t flags, sa_sint_t r, sa_sint_t * RESTRICT I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads, LIBSAIS_THREAD_STATE * 
RESTRICT thread_state) { /* Clamp fs so that n + fs cannot exceed SAINT_MAX. */ fs = fs < (SAINT_MAX - n) ? fs : (SAINT_MAX - n); sa_sint_t m = libsais64_count_and_gather_lms_suffixes_8u_omp(T, SA, n, buckets, threads, thread_state); sa_sint_t k = libsais64_initialize_buckets_start_and_end_8u(buckets, freq); /* In GSA mode the call is rejected with -1 unless these three bucket counters have exactly the expected values (they are filled by the counting pass above). */ if ((flags & LIBSAIS_FLAGS_GSA) && (buckets[0] != 0 || buckets[2] != 0 || buckets[3] != 1)) { return -1; } if (m > 0) { sa_sint_t first_lms_suffix = SA[n - m]; sa_sint_t left_suffixes_count = libsais64_initialize_buckets_for_lms_suffixes_radix_sort_8u(T, buckets, first_lms_suffix); if (threads > 1 && n >= 65536) { memset(SA, 0, ((size_t)n - (size_t)m) * sizeof(sa_sint_t)); } libsais64_radix_sort_lms_suffixes_8u_omp(T, SA, n, m, flags, buckets, threads, thread_state); if (threads > 1 && n >= 65536) { memset(&SA[(fast_sint_t)n - (fast_sint_t)m], 0, (size_t)m * sizeof(sa_sint_t)); } libsais64_initialize_buckets_for_partial_sorting_8u(T, buckets, first_lms_suffix, left_suffixes_count); libsais64_induce_partial_order_8u_omp(T, SA, n, k, flags, buckets, first_lms_suffix, left_suffixes_count, threads, thread_state); sa_sint_t names = libsais64_renumber_and_gather_lms_suffixes_omp(SA, n, m, fs, threads, thread_state); /* names < m: some LMS substrings are equal, so the reduced string (stored in the tail of SA) has to be sorted by a nested call. */ if (names < m) { if (libsais64_main_32s_entry(SA + n + fs - m, SA, m, names, fs + n - 2 * m, threads, thread_state) != 0) { return -2; } libsais64_gather_lms_suffixes_8u_omp(T, SA, n, threads, thread_state); libsais64_reconstruct_lms_suffixes_omp(SA, n, m, threads); } libsais64_place_lms_suffixes_interval_8u(SA, n, m, flags, buckets); } else { memset(SA, 0, (size_t)n * sizeof(sa_sint_t)); } return libsais64_induce_final_order_8u_omp(T, SA, n, k, flags, r, I, buckets, threads, thread_state); }

/*
 * Allocates the working memory for a byte-input run (per-thread state when
 * threads > 1, plus a 4096-byte aligned table of 8 * ALPHABET_SIZE buckets),
 * runs libsais64_main_8u and releases everything again.
 * Returns -2 if either allocation fails, otherwise the result of
 * libsais64_main_8u. Both release helpers are called unconditionally, also
 * with NULL arguments.
 */
static sa_sint_t libsais64_main(const uint8_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t flags, sa_sint_t r, sa_sint_t * I, sa_sint_t fs, sa_sint_t * freq, sa_sint_t threads) { LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ?
libsais64_alloc_thread_state(threads) : NULL; sa_sint_t * RESTRICT buckets = (sa_sint_t *)libsais64_alloc_aligned((size_t)8 * ALPHABET_SIZE * sizeof(sa_sint_t), 4096); sa_sint_t index = buckets != NULL && (thread_state != NULL || threads == 1) ? libsais64_main_8u(T, SA, n, buckets, flags, r, I, fs, freq, threads, thread_state) : -2; libsais64_free_aligned(buckets); libsais64_free_thread_state(thread_state); return index; }

/*
 * Same wrapper for integer input: allocates per-thread state when
 * threads > 1, runs libsais64_main_32s_entry and frees the state.
 * Returns -2 if the thread state could not be allocated.
 */
static sa_sint_t libsais64_main_long(sa_sint_t * T, sa_sint_t * SA, sa_sint_t n, sa_sint_t k, sa_sint_t fs, sa_sint_t threads) { LIBSAIS_THREAD_STATE * RESTRICT thread_state = threads > 1 ? libsais64_alloc_thread_state(threads) : NULL; sa_sint_t index = thread_state != NULL || threads == 1 ? libsais64_main_32s_entry(T, SA, n, k, fs, threads, thread_state) : -2; libsais64_free_thread_state(thread_state); return index; }

/*
 * Copies n elements of A to U, keeping only the low 8 bits of each element
 * (plain cast to uint8_t). Main loop unrolled by eight with read prefetch.
 */
static void libsais64_bwt_copy_8u(uint8_t * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = 0, j = (fast_sint_t)n - 7; i < j; i += 8) { libsais64_prefetchr(&A[i + prefetch_distance]); U[i + 0] = (uint8_t)A[i + 0]; U[i + 1] = (uint8_t)A[i + 1]; U[i + 2] = (uint8_t)A[i + 2]; U[i + 3] = (uint8_t)A[i + 3]; U[i + 4] = (uint8_t)A[i + 4]; U[i + 5] = (uint8_t)A[i + 5]; U[i + 6] = (uint8_t)A[i + 6]; U[i + 7] = (uint8_t)A[i + 7]; } for (j += 7; i < j; i += 1) { U[i] = (uint8_t)A[i]; } }

/*
 * Parallel front end of libsais64_bwt_copy_8u: the n elements are divided
 * into 16-aligned stripes, one per thread (last thread takes the rest).
 * The parallel region is enabled only when threads > 1 and n >= 65536;
 * without OpenMP the whole range is copied by a single call.
 */
static void libsais64_bwt_copy_8u_omp(uint8_t * RESTRICT U, sa_sint_t * RESTRICT A, sa_sint_t n, sa_sint_t threads) {
#if defined(LIBSAIS_OPENMP)
#pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
#endif
{
#if defined(LIBSAIS_OPENMP)
fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); fast_sint_t omp_block_stride = ((fast_sint_t)n / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ?
omp_block_stride : (fast_sint_t)n - omp_block_start; #else UNUSED(threads); fast_sint_t omp_block_start = 0; fast_sint_t omp_block_size = (fast_sint_t)n; #endif libsais64_bwt_copy_8u(U + omp_block_start, A + omp_block_start, (sa_sint_t)omp_block_size); } } int64_t libsais64(const uint8_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq) { if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int64_t)); } if (n == 1) { SA[0] = 0; if (freq != NULL) { freq[T[0]]++; } } return 0; } if (n <= INT32_MAX) { sa_sint_t new_fs = (fs + fs + n + n) <= INT32_MAX ? (fs + fs + n) : INT32_MAX - n; sa_sint_t index = libsais(T, (int32_t *)SA, (int32_t)n, (int32_t)new_fs, (int32_t *)freq); if (index >= 0) { libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)SA, n, 1); if (freq != NULL) { libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, ALPHABET_SIZE, 1); } } return index; } return libsais64_main(T, SA, n, LIBSAIS_FLAGS_NONE, 0, NULL, fs, freq, 1); } int64_t libsais64_gsa(const uint8_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq) { if ((T == NULL) || (SA == NULL) || (n < 0) || (n > 0 && T[n - 1] != 0) || (fs < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int64_t)); } if (n == 1) { SA[0] = 0; if (freq != NULL) { freq[T[0]]++; } } return 0; } if (n <= INT32_MAX) { sa_sint_t new_fs = (fs + fs + n + n) <= INT32_MAX ? 
(fs + fs + n) : INT32_MAX - n; sa_sint_t index = libsais_gsa(T, (int32_t *)SA, (int32_t)n, (int32_t)new_fs, (int32_t *)freq); if (index >= 0) { libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)SA, n, 1); if (freq != NULL) { libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, ALPHABET_SIZE, 1); } } return index; } return libsais64_main(T, SA, n, LIBSAIS_FLAGS_GSA, 0, NULL, fs, freq, 1); } int64_t libsais64_long(int64_t * T, int64_t * SA, int64_t n, int64_t k, int64_t fs) { if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0)) { return -1; } else if (n <= 1) { if (n == 1) { SA[0] = 0; } return 0; } return libsais64_main_long(T, SA, n, k, fs, 1); } int64_t libsais64_bwt(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int64_t)); } if (n == 1) { U[0] = T[0]; if (freq != NULL) { freq[T[0]]++; } } return n; } if (n <= INT32_MAX) { sa_sint_t new_fs = (fs + fs + n + n) <= INT32_MAX ? 
(fs + fs + n) : INT32_MAX - n; sa_sint_t index = libsais_bwt(T, U, (int32_t *)A, (int32_t)n, (int32_t)new_fs, (int32_t *)freq); if (index >= 0) { if (freq != NULL) { libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, ALPHABET_SIZE, 1); } } return index; } sa_sint_t index = libsais64_main(T, A, n, LIBSAIS_FLAGS_BWT, 0, NULL, fs, freq, 1); if (index >= 0) { index++; U[0] = T[n - 1]; libsais64_bwt_copy_8u_omp(U + 1, A, index - 1, 1); libsais64_bwt_copy_8u_omp(U + index, A + index, n - index, 1); } return index; } int64_t libsais64_bwt_aux(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq, int64_t r, int64_t * I) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int64_t)); } if (n == 1) { U[0] = T[0]; if (freq != NULL) { freq[T[0]]++; } } I[0] = n; return 0; } if (n <= INT32_MAX && r <= INT32_MAX) { sa_sint_t new_fs = (fs + fs + n + n) <= INT32_MAX ? 
(fs + fs + n) : INT32_MAX - n; sa_sint_t index = libsais_bwt_aux(T, U, (int32_t *)A, (int32_t)n, (int32_t)new_fs, (int32_t *)freq, (int32_t)r, (int32_t *)I); if (index >= 0) { libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)I, 1 + ((n - 1) / r), 1); if (freq != NULL) { libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, ALPHABET_SIZE, 1); } } return index; } sa_sint_t index = libsais64_main(T, A, n, LIBSAIS_FLAGS_BWT, r, I, fs, freq, 1); if (index == 0) { U[0] = T[n - 1]; libsais64_bwt_copy_8u_omp(U + 1, A, I[0] - 1, 1); libsais64_bwt_copy_8u_omp(U + I[0], A + I[0], n - I[0], 1); } return index; } #if defined(LIBSAIS_OPENMP) int64_t libsais64_omp(const uint8_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq, int64_t threads) { if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0) || (threads < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int64_t)); } if (n == 1) { SA[0] = 0; if (freq != NULL) { freq[T[0]]++; } } return 0; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? threads : 1; if (n <= INT32_MAX) { sa_sint_t new_fs = (fs + fs + n + n) <= INT32_MAX ? 
(fs + fs + n) : INT32_MAX - n; sa_sint_t index = libsais_omp(T, (int32_t *)SA, (int32_t)n, (int32_t)new_fs, (int32_t *)freq, (int32_t)threads); if (index >= 0) { libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)SA, n, threads); if (freq != NULL) { libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, ALPHABET_SIZE, threads); } } return index; } return libsais64_main(T, SA, n, LIBSAIS_FLAGS_NONE, 0, NULL, fs, freq, threads); } int64_t libsais64_gsa_omp(const uint8_t * T, int64_t * SA, int64_t n, int64_t fs, int64_t * freq, int64_t threads) { if ((T == NULL) || (SA == NULL) || (n < 0) || (n > 0 && T[n - 1] != 0) || (fs < 0) || (threads < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int64_t)); } if (n == 1) { SA[0] = 0; if (freq != NULL) { freq[T[0]]++; } } return 0; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? threads : 1; if (n <= INT32_MAX) { sa_sint_t new_fs = (fs + fs + n + n) <= INT32_MAX ? (fs + fs + n) : INT32_MAX - n; sa_sint_t index = libsais_gsa_omp(T, (int32_t *)SA, (int32_t)n, (int32_t)new_fs, (int32_t *)freq, (int32_t)threads); if (index >= 0) { libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)SA, n, threads); if (freq != NULL) { libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, ALPHABET_SIZE, threads); } } return index; } return libsais64_main(T, SA, n, LIBSAIS_FLAGS_GSA, 0, NULL, fs, freq, threads); } int64_t libsais64_long_omp(int64_t * T, int64_t * SA, int64_t n, int64_t k, int64_t fs, int64_t threads) { if ((T == NULL) || (SA == NULL) || (n < 0) || (fs < 0) || (threads < 0)) { return -1; } else if (n <= 1) { if (n == 1) { SA[0] = 0; } return 0; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? 
threads : 1; return libsais64_main_long(T, SA, n, k, fs, threads); } int64_t libsais64_bwt_omp(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq, int64_t threads) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (threads < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int64_t)); } if (n == 1) { U[0] = T[0]; if (freq != NULL) { freq[T[0]]++; } } return n; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? threads : 1; if (n <= INT32_MAX) { sa_sint_t new_fs = (fs + fs + n + n) <= INT32_MAX ? (fs + fs + n) : INT32_MAX - n; sa_sint_t index = libsais_bwt_omp(T, U, (int32_t *)A, (int32_t)n, (int32_t)new_fs, (int32_t *)freq, (int32_t)threads); if (index >= 0) { if (freq != NULL) { libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, ALPHABET_SIZE, threads); } } return index; } sa_sint_t index = libsais64_main(T, A, n, LIBSAIS_FLAGS_BWT, 0, NULL, fs, freq, threads); if (index >= 0) { index++; U[0] = T[n - 1]; libsais64_bwt_copy_8u_omp(U + 1, A, index - 1, threads); libsais64_bwt_copy_8u_omp(U + index, A + index, n - index, threads); } return index; } int64_t libsais64_bwt_aux_omp(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, int64_t fs, int64_t * freq, int64_t r, int64_t * I, int64_t threads) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || (fs < 0) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL) || (threads < 0)) { return -1; } else if (n <= 1) { if (freq != NULL) { memset(freq, 0, ALPHABET_SIZE * sizeof(int64_t)); } if (n == 1) { U[0] = T[0]; if (freq != NULL) { freq[T[0]]++; } } I[0] = n; return 0; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? threads : 1; if (n <= INT32_MAX && r <= INT32_MAX) { sa_sint_t new_fs = (fs + fs + n + n) <= INT32_MAX ? 
(fs + fs + n) : INT32_MAX - n; sa_sint_t index = libsais_bwt_aux_omp(T, U, (int32_t *)A, (int32_t)n, (int32_t)new_fs, (int32_t *)freq, (int32_t)r, (int32_t *)I, (int32_t)threads); if (index >= 0) { libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)I, 1 + ((n - 1) / r), threads); if (freq != NULL) { libsais64_convert_inplace_32u_to_64u_omp((uint32_t *)freq, ALPHABET_SIZE, threads); } } return index; } sa_sint_t index = libsais64_main(T, A, n, LIBSAIS_FLAGS_BWT, r, I, fs, freq, threads); if (index == 0) { U[0] = T[n - 1]; libsais64_bwt_copy_8u_omp(U + 1, A, I[0] - 1, threads); libsais64_bwt_copy_8u_omp(U + I[0], A + I[0], n - I[0], threads); } return index; } #endif static void libsais64_unbwt_compute_histogram(const uint8_t * RESTRICT T, fast_sint_t n, sa_uint_t * RESTRICT count) { const fast_sint_t prefetch_distance = 256; const uint8_t * RESTRICT T_p = T; if (n >= 1024) { sa_uint_t copy[4 * (ALPHABET_SIZE + 16)]; memset(copy, 0, (size_t)4 * (ALPHABET_SIZE + 16) * sizeof(sa_uint_t)); sa_uint_t * RESTRICT copy0 = copy + 0 * (ALPHABET_SIZE + 16); sa_uint_t * RESTRICT copy1 = copy + 1 * (ALPHABET_SIZE + 16); sa_uint_t * RESTRICT copy2 = copy + 2 * (ALPHABET_SIZE + 16); sa_uint_t * RESTRICT copy3 = copy + 3 * (ALPHABET_SIZE + 16); for (; T_p < (uint8_t * )((ptrdiff_t)(T + 63) & (-64)); T_p += 1) { copy0[T_p[0]]++; } fast_uint_t x = ((const uint32_t *)(const void *)T_p)[0], y = ((const uint32_t *)(const void *)T_p)[1]; for (; T_p < (uint8_t * )((ptrdiff_t)(T + n - 8) & (-64)); T_p += 64) { libsais64_prefetchr(&T_p[prefetch_distance]); fast_uint_t z = ((const uint32_t *)(const void *)T_p)[2], w = ((const uint32_t *)(const void *)T_p)[3]; copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++; copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++; x = ((const uint32_t *)(const void *)T_p)[4]; y = ((const uint32_t *)(const void *)T_p)[5]; copy0[(uint8_t)z]++; z >>= 8; 
/* Continuation of libsais64_unbwt_compute_histogram: remainder of the unrolled 64-byte loop body. */
                                          copy1[(uint8_t)z]++; z >>= 8; copy2[(uint8_t)z]++; z >>= 8; copy3[z]++;
            copy0[(uint8_t)w]++; w >>= 8; copy1[(uint8_t)w]++; w >>= 8; copy2[(uint8_t)w]++; w >>= 8; copy3[w]++;

            z = ((const uint32_t *)(const void *)T_p)[6]; w = ((const uint32_t *)(const void *)T_p)[7];
            copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++;
            copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++;

            x = ((const uint32_t *)(const void *)T_p)[8]; y = ((const uint32_t *)(const void *)T_p)[9];
            copy0[(uint8_t)z]++; z >>= 8; copy1[(uint8_t)z]++; z >>= 8; copy2[(uint8_t)z]++; z >>= 8; copy3[z]++;
            copy0[(uint8_t)w]++; w >>= 8; copy1[(uint8_t)w]++; w >>= 8; copy2[(uint8_t)w]++; w >>= 8; copy3[w]++;

            z = ((const uint32_t *)(const void *)T_p)[10]; w = ((const uint32_t *)(const void *)T_p)[11];
            copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++;
            copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++;

            x = ((const uint32_t *)(const void *)T_p)[12]; y = ((const uint32_t *)(const void *)T_p)[13];
            copy0[(uint8_t)z]++; z >>= 8; copy1[(uint8_t)z]++; z >>= 8; copy2[(uint8_t)z]++; z >>= 8; copy3[z]++;
            copy0[(uint8_t)w]++; w >>= 8; copy1[(uint8_t)w]++; w >>= 8; copy2[(uint8_t)w]++; w >>= 8; copy3[w]++;

            z = ((const uint32_t *)(const void *)T_p)[14]; w = ((const uint32_t *)(const void *)T_p)[15];
            copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++;
            copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++;

            /* Words 16 and 17 are the first eight bytes of the next 64-byte chunk. */
            x = ((const uint32_t *)(const void *)T_p)[16]; y = ((const uint32_t *)(const void *)T_p)[17];
            copy0[(uint8_t)z]++; z >>= 8; copy1[(uint8_t)z]++; z >>= 8; copy2[(uint8_t)z]++; z >>= 8; copy3[z]++;
            copy0[(uint8_t)w]++; w >>= 8; copy1[(uint8_t)w]++; w >>= 8; copy2[(uint8_t)w]++; w >>= 8; copy3[w]++;
        }

        /* x and y still hold eight loaded-but-uncounted bytes at T_p; count them and step over them. */
        copy0[(uint8_t)x]++; x >>= 8; copy1[(uint8_t)x]++; x >>= 8; copy2[(uint8_t)x]++; x >>= 8; copy3[x]++;
        copy0[(uint8_t)y]++; y >>= 8; copy1[(uint8_t)y]++; y >>= 8; copy2[(uint8_t)y]++; y >>= 8; copy3[y]++;

        T_p += 8;

        /* Fold the four partial histograms into the caller's counters. */
        fast_uint_t i; for (i = 0; i < ALPHABET_SIZE; i++) { count[i] += copy0[i] + copy1[i] + copy2[i] + copy3[i]; }
    }

    /* Remaining bytes (or the whole input when n < 1024) are counted one at a time. */
    for (; T_p < T + n; T_p += 1) { count[T_p[0]]++; }
}

/*
 * In-place transpose of bucket2 viewed as a matrix with row stride 256
 * (element (a, b) lives at bucket2[(a << 8) + b]), processed in 16 x 16 tiles:
 * each diagonal tile is transposed within itself, and each tile to the right
 * of the diagonal is swapped with its mirror tile below the diagonal.
 */
static void libsais64_unbwt_transpose_bucket2(sa_uint_t * RESTRICT bucket2)
{
    fast_uint_t x, y, c, d;
    for (x = 0; x != ALPHABET_SIZE; x += 16)
    {
        /* Diagonal tile: swap (d, c) with (c, d) for c < d inside the tile. */
        for (c = x; c != x + 16; ++c)
        {
            for (d = c + 1; d != x + 16; ++d)
            {
                sa_uint_t tmp = bucket2[(d << 8) + c]; bucket2[(d << 8) + c] = bucket2[(c << 8) + d]; bucket2[(c << 8) + d] = tmp;
            }
        }

        /* Off-diagonal tiles: column c of tile (y, x) is exchanged with row c of tile (x, y). */
        for (y = x + 16; y != ALPHABET_SIZE; y += 16)
        {
            for (c = x; c != x + 16; ++c)
            {
                sa_uint_t * bucket2_yc = &bucket2[(y << 8) + c];
                sa_uint_t * bucket2_cy = &bucket2[(c << 8) + y];

                sa_uint_t tmp00 = bucket2_yc[ 0 * 256]; bucket2_yc[ 0 * 256] = bucket2_cy[ 0]; bucket2_cy[ 0] = tmp00;
                sa_uint_t tmp01 = bucket2_yc[ 1 * 256]; bucket2_yc[ 1 * 256] = bucket2_cy[ 1]; bucket2_cy[ 1] = tmp01;
                sa_uint_t tmp02 = bucket2_yc[ 2 * 256]; bucket2_yc[ 2 * 256] = bucket2_cy[ 2]; bucket2_cy[ 2] = tmp02;
                sa_uint_t tmp03 = bucket2_yc[ 3 * 256]; bucket2_yc[ 3 * 256] = bucket2_cy[ 3]; bucket2_cy[ 3] = tmp03;
                sa_uint_t tmp04 = bucket2_yc[ 4 * 256]; bucket2_yc[ 4 * 256] = bucket2_cy[ 4]; bucket2_cy[ 4] = tmp04;
                sa_uint_t tmp05 = bucket2_yc[ 5 * 256]; bucket2_yc[ 5 * 256] = bucket2_cy[ 5]; bucket2_cy[ 5] = tmp05;
                sa_uint_t tmp06 = bucket2_yc[ 6 * 256]; bucket2_yc[ 6 * 256] = bucket2_cy[ 6]; bucket2_cy[ 6] = tmp06;
                sa_uint_t tmp07 = bucket2_yc[ 7 * 256]; bucket2_yc[ 7 * 256] = bucket2_cy[ 7]; bucket2_cy[ 7] = tmp07;
                sa_uint_t tmp08 = bucket2_yc[ 8 * 256]; bucket2_yc[ 8 * 256] = bucket2_cy[ 8]; bucket2_cy[ 8] = tmp08;
                sa_uint_t tmp09 = bucket2_yc[ 9 * 256]; bucket2_yc[ 9 * 256] = bucket2_cy[ 9]; bucket2_cy[ 9] = tmp09;
                sa_uint_t tmp10 = bucket2_yc[10 * 256]; bucket2_yc[10 * 256] = bucket2_cy[10]; bucket2_cy[10] = tmp10;
                sa_uint_t tmp11 = bucket2_yc[11 * 256]; bucket2_yc[11 * 256] = bucket2_cy[11]; bucket2_cy[11] = tmp11;
                sa_uint_t tmp12 = bucket2_yc[12 * 256]; bucket2_yc[12 * 256] = bucket2_cy[12]; bucket2_cy[12] = tmp12;
                sa_uint_t tmp13 = bucket2_yc[13 * 256]; bucket2_yc[13 * 256] = bucket2_cy[13]; bucket2_cy[13] = tmp13;
                sa_uint_t tmp14 = bucket2_yc[14 * 256]; bucket2_yc[14 * 256] = bucket2_cy[14]; bucket2_cy[14] = tmp14;
                sa_uint_t tmp15 = bucket2_yc[15 * 256]; bucket2_yc[15 * 256] = bucket2_cy[15]; bucket2_cy[15] = tmp15;
            }
        }
    }
}

/*
 * Single-threaded bigram histogram.
 * bucket1 enters holding per-symbol counts and leaves holding their exclusive
 * prefix sums starting at 1. For every symbol c with a non-empty range
 * [prev, sum) a byte histogram is accumulated into row c of bucket2; the range
 * is split around 'index': positions below index read T at the same offset,
 * positions above it read T one element earlier, and position 'index' itself
 * is not counted. bucket2 is transposed at the end.
 */
static void libsais64_unbwt_compute_bigram_histogram_single(const uint8_t * RESTRICT T, sa_uint_t * RESTRICT bucket1, sa_uint_t * RESTRICT bucket2, fast_uint_t index)
{
    fast_uint_t sum, c;
    for (sum = 1, c = 0; c < ALPHABET_SIZE; ++c)
    {
        fast_uint_t prev = sum; sum += bucket1[c]; bucket1[c] = (sa_uint_t)prev;
        if (prev != sum)
        {
            sa_uint_t * RESTRICT bucket2_p = &bucket2[c << 8];

            {
                /* Part of [prev, sum) lying below index. */
                fast_uint_t hi = index; if (sum < hi) { hi = sum; }
                libsais64_unbwt_compute_histogram(&T[prev], (fast_sint_t)(hi - prev), bucket2_p);
            }

            {
                /* Part of [prev, sum) lying above index, shifted back by one in T. */
                fast_uint_t lo = index + 1; if (prev > lo) { lo = prev; }
                libsais64_unbwt_compute_histogram(&T[lo - 1], (fast_sint_t)(sum - lo), bucket2_p);
            }
        }
    }

    libsais64_unbwt_transpose_bucket2(bucket2);
}

/*
 * Replaces the ALPHABET_SIZE * ALPHABET_SIZE counts in bucket2 by exclusive
 * prefix sums (starting at 1, with one extra slot inserted in front of the
 * rows of symbol lastc) and fills the fastbits lookup table: for every
 * non-empty bucket w, each not-yet-filled entry v up to (sum - 1) >> shift is
 * set to w, where sum is the running total after bucket w.
 */
static void libsais64_unbwt_calculate_fastbits(sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t lastc, fast_uint_t shift)
{
    fast_uint_t v, w, sum, c, d;
    for (v = 0, w = 0, sum = 1, c = 0; c < ALPHABET_SIZE; ++c)
    {
        if (c == lastc) { sum += 1; }

        for (d = 0; d < ALPHABET_SIZE; ++d, ++w)
        {
            fast_uint_t prev = sum; sum += bucket2[w]; bucket2[w] = (sa_uint_t)prev;
            if (prev != sum)
            {
                for (; v <= ((sum - 1) >> shift); ++v) { fastbits[v] = (uint16_t)w; }
            }
        }
    }
}

/*
 * Fills P for the positions in [omp_block_start, omp_block_end].
 * Each position takes the next slot p of its symbol c from bucket1; unless
 * p == index, a 16-bit key w is formed from a second byte of T (high 8 bits)
 * and c (low 8 bits), and the position is stored at the next slot of bucket2[w].
 * Positions below index are stored as i and read T[i]; positions above index
 * are stored as i and read T[i - 1].
 */
static void libsais64_unbwt_calculate_biPSI(const uint8_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket1, sa_uint_t * RESTRICT bucket2, fast_uint_t index, fast_sint_t omp_block_start, fast_sint_t omp_block_end)
{
    {
        fast_sint_t i = omp_block_start, j = (fast_sint_t)index; if (omp_block_end < j) { j = omp_block_end; }
        for (; i < j; ++i)
        {
            fast_uint_t c = T[i];
            fast_uint_t p = bucket1[c]++;
            fast_sint_t t = (fast_sint_t)(index - p);

            if (t != 0)
            {
                /* The sign-extending shift of t adds all-ones (i.e. -1 modulo 2^w) to p when p > index, else 0. */
                fast_uint_t w = (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))]) << 8) + c;
                P[bucket2[w]++] = (sa_uint_t)i;
            }
        }
    }

    {
        fast_sint_t i = (fast_sint_t)index, j = omp_block_end; if (omp_block_start > i) { i = omp_block_start; }
        for (i += 1; i <= j; ++i)
        {
            fast_uint_t c = T[i - 1];
            fast_uint_t p = bucket1[c]++;
            fast_sint_t t = (fast_sint_t)(index - p);

            if (t != 0)
            {
                fast_uint_t w = (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))]) << 8) + c;
                P[bucket2[w]++] = (sa_uint_t)i;
            }
        }
    }
}

/*
 * Single-threaded preparation of the tables used by the decode routines.
 * Takes index from I[0] and lastc from T[0], chooses the smallest shift with
 * (n >> shift) <= (1 << UNBWT_FASTBITS), obtains symbol counts either from
 * freq or by scanning T, then builds bucket2, fastbits and P in that order.
 */
static void libsais64_unbwt_init_single(const uint8_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits)
{
    sa_uint_t bucket1[ALPHABET_SIZE];

    fast_uint_t index = I[0];
    fast_uint_t lastc = T[0];
    fast_uint_t shift = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; }

    if (freq != NULL)
    {
        memcpy(bucket1, freq, ALPHABET_SIZE * sizeof(sa_uint_t));
    }
    else
    {
        memset(bucket1, 0, ALPHABET_SIZE * sizeof(sa_uint_t));
        libsais64_unbwt_compute_histogram(T, n, bucket1);
    }

    memset(bucket2, 0, ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));
    libsais64_unbwt_compute_bigram_histogram_single(T, bucket1, bucket2, index);

    libsais64_unbwt_calculate_fastbits(bucket2, fastbits, lastc, shift);
    libsais64_unbwt_calculate_biPSI(T, P, bucket1, bucket2, index, 0, n);
}

#if defined(LIBSAIS_OPENMP)

/*
 * Per-block bigram counting used by the parallel initialisation: walks
 * T[omp_block_start .. omp_block_start + omp_block_size), advancing the
 * caller-provided bucket1 cursors.
 * (Definition continues past this point; tokens below are unchanged.)
 */
static void libsais64_unbwt_compute_bigram_histogram_parallel(const uint8_t * RESTRICT T, fast_uint_t index, sa_uint_t * RESTRICT bucket1, sa_uint_t * RESTRICT bucket2, fast_sint_t omp_block_start, fast_sint_t omp_block_size)
{
    fast_sint_t i;
    for (i = omp_block_start; i < omp_block_start + omp_block_size; ++i)
    {
        fast_uint_t c = T[i];
        fast_uint_t p = bucket1[c]++;
/* Continuation of libsais64_unbwt_compute_bigram_histogram_parallel: remainder of the per-position loop body. */
        fast_sint_t t = (fast_sint_t)(index - p);

        if (t != 0)
        {
            /* The sign-extending shift of t adds all-ones (i.e. -1 modulo 2^w) to p when p > index, else 0. */
            fast_uint_t w = (((fast_uint_t)T[p + (fast_uint_t)(t >> ((sizeof(fast_sint_t) * 8) - 1))]) << 8) + c;
            bucket2[w]++;
        }
    }
}

/*
 * Multi-threaded preparation of bucket2, fastbits and P.
 *
 * 'buckets' supplies one scratch area per thread, laid out as ALPHABET_SIZE
 * bucket1 entries followed by ALPHABET_SIZE * ALPHABET_SIZE bucket2 entries.
 * If the parallel region ends up with a single thread the work is handed to
 * libsais64_unbwt_init_single. Otherwise the phases below run separated by
 * barriers; T is split into per-thread blocks whose stride is a multiple of
 * 16, with the last thread taking the remainder.
 */
static void libsais64_unbwt_init_parallel(const uint8_t * RESTRICT T, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_uint_t * RESTRICT buckets, sa_sint_t threads)
{
    sa_uint_t bucket1[ALPHABET_SIZE];

    fast_uint_t index = I[0];
    fast_uint_t lastc = T[0];
    fast_uint_t shift = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; }

    memset(bucket1, 0, ALPHABET_SIZE * sizeof(sa_uint_t));
    memset(bucket2, 0, ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));

    #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536)
    {
        fast_sint_t omp_thread_num = omp_get_thread_num();
        fast_sint_t omp_num_threads = omp_get_num_threads();

        if (omp_num_threads == 1)
        {
            libsais64_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits);
        }
        else
        {
            sa_uint_t * RESTRICT bucket1_local = buckets + omp_thread_num * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE));
            sa_uint_t * RESTRICT bucket2_local = bucket1_local + ALPHABET_SIZE;

            fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16);
            fast_sint_t omp_block_start = omp_thread_num * omp_block_stride;
            fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start;

            /* Phase 1: each thread counts the symbols of its own block. */
            {
                memset(bucket1_local, 0, ALPHABET_SIZE * sizeof(sa_uint_t));
                libsais64_unbwt_compute_histogram(T + omp_block_start, omp_block_size, bucket1_local);
            }

            #pragma omp barrier

            /* Phase 2 (master): per-thread counts become per-thread offsets within each symbol;
               bucket1 becomes the exclusive prefix sum of the totals, starting at 1. */
            #pragma omp master
            {
                {
                    sa_uint_t * RESTRICT bucket1_temp = buckets;

                    fast_sint_t t;
                    for (t = 0; t < omp_num_threads; ++t, bucket1_temp += ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE))
                    {
                        fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_uint_t A = bucket1[c], B = bucket1_temp[c]; bucket1[c] = A + B; bucket1_temp[c] = A; }
                    }
                }

                {
                    fast_uint_t sum, c;
                    for (sum = 1, c = 0; c < ALPHABET_SIZE; ++c) { fast_uint_t prev = sum; sum += bucket1[c]; bucket1[c] = (sa_uint_t)prev; }
                }
            }

            #pragma omp barrier

            /* Phase 3: add the global symbol starts to the local offsets, then count bigrams for the local block. */
            {
                fast_sint_t c; for (c = 0; c < ALPHABET_SIZE; c += 1) { sa_uint_t A = bucket1[c], B = bucket1_local[c]; bucket1_local[c] = A + B; }

                memset(bucket2_local, 0, ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));
                libsais64_unbwt_compute_bigram_histogram_parallel(T, index, bucket1_local, bucket2_local, omp_block_start, omp_block_size);
            }

            #pragma omp barrier

            /* Phase 4: every thread reduces its own slice of the bigram table across all threads;
               each thread's table is left holding the running total of the threads before it. */
            {
                fast_sint_t omp_bucket2_stride = ((ALPHABET_SIZE * ALPHABET_SIZE) / omp_num_threads) & (-16);
                fast_sint_t omp_bucket2_start = omp_thread_num * omp_bucket2_stride;
                fast_sint_t omp_bucket2_size = omp_thread_num < omp_num_threads - 1 ? omp_bucket2_stride : (ALPHABET_SIZE * ALPHABET_SIZE) - omp_bucket2_start;

                sa_uint_t * RESTRICT bucket2_temp = buckets + ALPHABET_SIZE;

                fast_sint_t t;
                for (t = 0; t < omp_num_threads; ++t, bucket2_temp += ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE))
                {
                    fast_sint_t c; for (c = omp_bucket2_start; c < omp_bucket2_start + omp_bucket2_size; c += 1) { sa_uint_t A = bucket2[c], B = bucket2_temp[c]; bucket2[c] = A + B; bucket2_temp[c] = A; }
                }
            }

            #pragma omp barrier

            /* Phase 5 (master): build fastbits from the totals, then shift every thread's bucket1 area
               down by one thread (thread t receives thread t - 1's values, thread 0 receives bucket1). */
            #pragma omp master
            {

                libsais64_unbwt_calculate_fastbits(bucket2, fastbits, lastc, shift);

                {
                    fast_sint_t t;
                    for (t = omp_num_threads - 1; t >= 1; --t)
                    {
                        sa_uint_t * RESTRICT dst_bucket1 = buckets + t * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE));
                        sa_uint_t * RESTRICT src_bucket1 = dst_bucket1 - (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE));

                        memcpy(dst_bucket1, src_bucket1, ALPHABET_SIZE * sizeof(sa_uint_t));
                    }

                    memcpy(buckets, bucket1, ALPHABET_SIZE * sizeof(sa_uint_t));
                }
            }

            #pragma omp barrier

            /* Phase 6: add the global bigram starts to the local offsets and fill P for the local block. */
            {
                fast_sint_t c; for (c = 0; c < ALPHABET_SIZE * ALPHABET_SIZE; c += 1) { sa_uint_t A = bucket2[c], B = bucket2_local[c]; bucket2_local[c] = A + B; }

                libsais64_unbwt_calculate_biPSI(T, P, bucket1_local, bucket2_local, index, omp_block_start, omp_block_start + omp_block_size);
            }

            #pragma omp barrier

            /* Phase 7 (master): the last thread's bucket2 table becomes the shared bucket2. */
            #pragma omp master
            {
                memcpy(bucket2, buckets + ALPHABET_SIZE + (omp_num_threads - 1) * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)), ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t));
            }
        }
    }
}

#endif

/*
 * Decodes k 16-bit units of one stream into U.
 * Starting from position *i0, each step takes fastbits[p0 >> shift] as the
 * initial candidate c0, advances c0 while bucket2[c0] <= p0, follows P to the
 * next position and stores libsais64_bswap16(c0). The final position is
 * written back to *i0.
 */
static void libsais64_unbwt_decode_1(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t * i0, fast_uint_t k)
{
    uint16_t * RESTRICT U0 = (uint16_t *)(void *)U;

    fast_uint_t i, p0 = *i0;

    for (i = 0; i != k; ++i)
    {
        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais64_bswap16(c0);
    }

    *i0 = p0;
}

/* Two-stream variant of the decoder above (parameter list and body continue on the following lines). */
static void libsais64_unbwt_decode_2(uint8_t * RESTRICT U,
/* Continuation of the libsais64_unbwt_decode_2 parameter list. Stream j (j = 0, 1) starts at *ij and writes to U + j * r bytes. */
                                     sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t k)
{
    uint16_t * RESTRICT U0 = (uint16_t *)(void *)U;
    uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r);

    fast_uint_t i, p0 = *i0, p1 = *i1;

    /* Per step and per stream: fastbits candidate, linear advance through bucket2, follow P, store byte-swapped unit. */
    for (i = 0; i != k; ++i)
    {
        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais64_bswap16(c0);
        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais64_bswap16(c1);
    }

    *i0 = p0; *i1 = p1;
}

/* Decodes k 16-bit units for each of 3 streams in lockstep; stream j starts at *ij and writes to U + j * r bytes. */
static void libsais64_unbwt_decode_3(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t k)
{
    uint16_t * RESTRICT U0 = (uint16_t *)(void *)U;
    uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r);
    uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r);

    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2;

    for (i = 0; i != k; ++i)
    {
        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais64_bswap16(c0);
        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais64_bswap16(c1);
        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais64_bswap16(c2);
    }

    *i0 = p0; *i1 = p1; *i2 = p2;
}

/* Decodes k 16-bit units for each of 4 streams in lockstep; stream j starts at *ij and writes to U + j * r bytes. */
static void libsais64_unbwt_decode_4(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t k)
{
    uint16_t * RESTRICT U0 = (uint16_t *)(void *)U;
    uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r);
    uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r);
    uint16_t * RESTRICT U3 = (uint16_t *)(void *)(((uint8_t *)U2) + r);

    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3;

    for (i = 0; i != k; ++i)
    {
        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais64_bswap16(c0);
        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais64_bswap16(c1);
        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais64_bswap16(c2);
        uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais64_bswap16(c3);
    }

    *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3;
}

/* Decodes k 16-bit units for each of 5 streams in lockstep; stream j starts at *ij and writes to U + j * r bytes. */
static void libsais64_unbwt_decode_5(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t k)
{
    uint16_t * RESTRICT U0 = (uint16_t *)(void *)U;
    uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r);
    uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r);
    uint16_t * RESTRICT U3 = (uint16_t *)(void *)(((uint8_t *)U2) + r);
    uint16_t * RESTRICT U4 = (uint16_t *)(void *)(((uint8_t *)U3) + r);

    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4;

    for (i = 0; i != k; ++i)
    {
        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais64_bswap16(c0);
        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais64_bswap16(c1);
        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais64_bswap16(c2);
        uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais64_bswap16(c3);
        uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = libsais64_bswap16(c4);
    }

    *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4;
}

/* Decodes k 16-bit units for each of 6 streams in lockstep; stream j starts at *ij and writes to U + j * r bytes. */
static void libsais64_unbwt_decode_6(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t k)
{
    uint16_t * RESTRICT U0 = (uint16_t *)(void *)U;
    uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r);
    uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r);
    uint16_t * RESTRICT U3 = (uint16_t *)(void *)(((uint8_t *)U2) + r);
    uint16_t * RESTRICT U4 = (uint16_t *)(void *)(((uint8_t *)U3) + r);
    uint16_t * RESTRICT U5 = (uint16_t *)(void *)(((uint8_t *)U4) + r);

    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5;

    for (i = 0; i != k; ++i)
    {
        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais64_bswap16(c0);
        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais64_bswap16(c1);
        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais64_bswap16(c2);
        uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais64_bswap16(c3);
        uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = libsais64_bswap16(c4);
        uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = libsais64_bswap16(c5);
    }

    *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5;
}

/* Decodes k 16-bit units for each of 7 streams in lockstep; stream j starts at *ij and writes to U + j * r bytes. */
static void libsais64_unbwt_decode_7(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6, fast_uint_t k)
{
    uint16_t * RESTRICT U0 = (uint16_t *)(void *)U;
    uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r);
    uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r);
    uint16_t * RESTRICT U3 = (uint16_t *)(void *)(((uint8_t *)U2) + r);
    uint16_t * RESTRICT U4 = (uint16_t *)(void *)(((uint8_t *)U3) + r);
    uint16_t * RESTRICT U5 = (uint16_t *)(void *)(((uint8_t *)U4) + r);
    uint16_t * RESTRICT U6 = (uint16_t *)(void *)(((uint8_t *)U5) + r);

    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6;

    for (i = 0; i != k; ++i)
    {
        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais64_bswap16(c0);
        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais64_bswap16(c1);
        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais64_bswap16(c2);
        uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais64_bswap16(c3);
        uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = libsais64_bswap16(c4);
        uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = libsais64_bswap16(c5);
        uint16_t c6 = fastbits[p6 >> shift]; if (bucket2[c6] <= p6) { do { c6++; } while (bucket2[c6] <= p6); } p6 = P[p6]; U6[i] = libsais64_bswap16(c6);
    }

    *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; *i6 = p6;
}

/* Decodes k 16-bit units for each of 8 streams in lockstep; stream j starts at *ij and writes to U + j * r bytes. */
static void libsais64_unbwt_decode_8(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_uint_t shift, fast_uint_t r, fast_uint_t * i0, fast_uint_t * i1, fast_uint_t * i2, fast_uint_t * i3, fast_uint_t * i4, fast_uint_t * i5, fast_uint_t * i6, fast_uint_t * i7, fast_uint_t k)
{
    uint16_t * RESTRICT U0 = (uint16_t *)(void *)U;
    uint16_t * RESTRICT U1 = (uint16_t *)(void *)(((uint8_t *)U0) + r);
    uint16_t * RESTRICT U2 = (uint16_t *)(void *)(((uint8_t *)U1) + r);
    uint16_t * RESTRICT U3 = (uint16_t *)(void *)(((uint8_t *)U2) + r);
    uint16_t * RESTRICT U4 = (uint16_t *)(void *)(((uint8_t *)U3) + r);
    uint16_t * RESTRICT U5 = (uint16_t *)(void *)(((uint8_t *)U4) + r);
    uint16_t * RESTRICT U6 = (uint16_t *)(void *)(((uint8_t *)U5) + r);
    uint16_t * RESTRICT U7 = (uint16_t *)(void *)(((uint8_t *)U6) + r);

    fast_uint_t i, p0 = *i0, p1 = *i1, p2 = *i2, p3 = *i3, p4 = *i4, p5 = *i5, p6 = *i6, p7 = *i7;

    for (i = 0; i != k; ++i)
    {
        uint16_t c0 = fastbits[p0 >> shift]; if (bucket2[c0] <= p0) { do { c0++; } while (bucket2[c0] <= p0); } p0 = P[p0]; U0[i] = libsais64_bswap16(c0);
        uint16_t c1 = fastbits[p1 >> shift]; if (bucket2[c1] <= p1) { do { c1++; } while (bucket2[c1] <= p1); } p1 = P[p1]; U1[i] = libsais64_bswap16(c1);
        uint16_t c2 = fastbits[p2 >> shift]; if (bucket2[c2] <= p2) { do { c2++; } while (bucket2[c2] <= p2); } p2 = P[p2]; U2[i] = libsais64_bswap16(c2);
        uint16_t c3 = fastbits[p3 >> shift]; if (bucket2[c3] <= p3) { do { c3++; } while (bucket2[c3] <= p3); } p3 = P[p3]; U3[i] = libsais64_bswap16(c3);
        uint16_t c4 = fastbits[p4 >> shift]; if (bucket2[c4] <= p4) { do { c4++; } while (bucket2[c4] <= p4); } p4 = P[p4]; U4[i] = libsais64_bswap16(c4);
        uint16_t c5 = fastbits[p5 >> shift]; if (bucket2[c5] <= p5) { do { c5++; } while (bucket2[c5] <= p5); } p5 = P[p5]; U5[i] = libsais64_bswap16(c5);
        uint16_t c6 = fastbits[p6 >> shift]; if (bucket2[c6] <= p6) { do { c6++; } while (bucket2[c6] <= p6); } p6 = P[p6]; U6[i] = libsais64_bswap16(c6);
        uint16_t c7 = fastbits[p7 >> shift]; if (bucket2[c7] <= p7) { do { c7++; } while (bucket2[c7] <= p7); } p7 = P[p7]; U7[i] = libsais64_bswap16(c7);
    }

    *i0 = p0; *i1 = p1; *i2 = p2; *i3 = p3; *i4 = p4; *i5 = p5; *i6 = p6; *i7 = p7;
}

/*
 * Decodes 'blocks' consecutive blocks of r bytes each into U, using the start
 * positions in I (one per block).
 * Full groups of eight blocks are decoded eight streams at a time while more
 * than eight blocks remain. In the final group of 1..8 blocks every stream
 * first decodes remainder >> 1 units; then all streams except the last one
 * decode the remaining (r >> 1) - (remainder >> 1) units, continuing from the
 * positions left by the first call.
 * (Definition continues past this point; tokens below are unchanged.)
 */
static void libsais64_unbwt_decode(uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, fast_sint_t blocks, fast_uint_t remainder)
{
    fast_uint_t shift = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; }
    fast_uint_t offset = 0;

    while (blocks > 8)
    {
        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7];
        libsais64_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, &i7, (fast_uint_t)r >> 1);
        I += 8; blocks -= 8; offset += 8 * (fast_uint_t)r;
    }

    if (blocks == 1)
    {
        fast_uint_t i0 = I[0];
        libsais64_unbwt_decode_1(U + offset, P, bucket2, fastbits, shift, &i0, remainder >> 1);
    }
    else if (blocks == 2)
    {
        fast_uint_t i0 = I[0], i1 = I[1];
        libsais64_unbwt_decode_2(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, remainder >> 1);
        libsais64_unbwt_decode_1(U + offset + 2 * (remainder >> 1), P, bucket2, fastbits, shift, &i0, ((fast_uint_t)r >> 1) - (remainder >> 1));
    }
    else if (blocks == 3)
    {
        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2];
        libsais64_unbwt_decode_3(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, remainder >> 1);
        libsais64_unbwt_decode_2(U + offset + 2 * (remainder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, ((fast_uint_t)r >> 1) - (remainder >> 1));
    }
    else if (blocks == 4)
    {
        fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3];
        libsais64_unbwt_decode_4(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, remainder
>> 1); libsais64_unbwt_decode_3(U + offset + 2 * (remainder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, ((fast_uint_t)r >> 1) - (remainder >> 1)); } else if (blocks == 5) { fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4]; libsais64_unbwt_decode_5(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, remainder >> 1); libsais64_unbwt_decode_4(U + offset + 2 * (remainder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, ((fast_uint_t)r >> 1) - (remainder >> 1)); } else if (blocks == 6) { fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5]; libsais64_unbwt_decode_6(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, remainder >> 1); libsais64_unbwt_decode_5(U + offset + 2 * (remainder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, ((fast_uint_t)r >> 1) - (remainder >> 1)); } else if (blocks == 7) { fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6]; libsais64_unbwt_decode_7(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, remainder >> 1); libsais64_unbwt_decode_6(U + offset + 2 * (remainder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, ((fast_uint_t)r >> 1) - (remainder >> 1)); } else { fast_uint_t i0 = I[0], i1 = I[1], i2 = I[2], i3 = I[3], i4 = I[4], i5 = I[5], i6 = I[6], i7 = I[7]; libsais64_unbwt_decode_8(U + offset, P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, &i7, remainder >> 1); libsais64_unbwt_decode_7(U + offset + 2 * (remainder >> 1), P, bucket2, fastbits, shift, (fast_uint_t)r, &i0, &i1, &i2, &i3, &i4, &i5, &i6, ((fast_uint_t)r >> 1) - (remainder >> 1)); } } static void libsais64_unbwt_decode_omp(const uint8_t * RESTRICT T, uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, sa_sint_t r, const sa_uint_t * RESTRICT I, 
sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_sint_t threads) { fast_uint_t lastc = T[0]; fast_sint_t blocks = 1 + (((fast_sint_t)n - 1) / (fast_sint_t)r); fast_uint_t remainder = (fast_uint_t)n - ((fast_uint_t)r * ((fast_uint_t)blocks - 1)); #if defined(LIBSAIS_OPENMP) fast_sint_t max_threads = blocks < threads ? blocks : threads; #pragma omp parallel num_threads(max_threads) if(max_threads > 1 && n >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = blocks / omp_num_threads; fast_sint_t omp_block_remainder = blocks % omp_num_threads; fast_sint_t omp_block_size = omp_block_stride + (omp_thread_num < omp_block_remainder); fast_sint_t omp_block_start = omp_block_stride * omp_thread_num + (omp_thread_num < omp_block_remainder ? omp_thread_num : omp_block_remainder); libsais64_unbwt_decode(U + r * omp_block_start, P, n, r, I + omp_block_start, bucket2, fastbits, omp_block_size, omp_thread_num < omp_num_threads - 1 ? 
(fast_uint_t)r : remainder); } U[n - 1] = (uint8_t)lastc; } static sa_sint_t libsais64_unbwt_core(const uint8_t * RESTRICT T, uint8_t * RESTRICT U, sa_uint_t * RESTRICT P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * RESTRICT I, sa_uint_t * RESTRICT bucket2, uint16_t * RESTRICT fastbits, sa_uint_t * RESTRICT buckets, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) if (threads > 1 && n >= 262144) { libsais64_unbwt_init_parallel(T, P, n, freq, I, bucket2, fastbits, buckets, threads); } else #else UNUSED(buckets); #endif { libsais64_unbwt_init_single(T, P, n, freq, I, bucket2, fastbits); } libsais64_unbwt_decode_omp(T, U, P, n, r, I, bucket2, fastbits, threads); return 0; } static sa_sint_t libsais64_unbwt_main(const uint8_t * T, uint8_t * U, sa_uint_t * P, sa_sint_t n, const sa_sint_t * freq, sa_sint_t r, const sa_uint_t * I, sa_sint_t threads) { fast_uint_t shift = 0; while ((n >> shift) > ((sa_sint_t)1 << UNBWT_FASTBITS)) { shift++; } sa_uint_t * RESTRICT bucket2 = (sa_uint_t *)libsais64_alloc_aligned(ALPHABET_SIZE * ALPHABET_SIZE * sizeof(sa_uint_t), 4096); uint16_t * RESTRICT fastbits = (uint16_t *)libsais64_alloc_aligned(((size_t)1 + (size_t)(n >> shift)) * sizeof(uint16_t), 4096); sa_uint_t * RESTRICT buckets = threads > 1 && n >= 262144 ? (sa_uint_t *)libsais64_alloc_aligned((size_t)threads * (ALPHABET_SIZE + (ALPHABET_SIZE * ALPHABET_SIZE)) * sizeof(sa_uint_t), 4096) : NULL; sa_sint_t index = bucket2 != NULL && fastbits != NULL && (buckets != NULL || threads == 1 || n < 262144) ? 
libsais64_unbwt_core(T, U, P, n, freq, r, I, bucket2, fastbits, buckets, threads) : -2; libsais64_free_aligned(buckets); libsais64_free_aligned(fastbits); libsais64_free_aligned(bucket2); return index; } int64_t libsais64_unbwt(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t i) { return libsais64_unbwt_aux(T, U, A, n, freq, n, &i); } int64_t libsais64_unbwt_aux(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t r, const int64_t * I) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL)) { return -1; } else if (n <= 1) { if (I[0] != n) { return -1; } if (n == 1) { U[0] = T[0]; } return 0; } fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } } if (n <= INT32_MAX && r <= INT32_MAX && (n - 1) / r < 1024) { int32_t indexes[1024]; for (t = 0; t <= (n - 1) / r; ++t) { indexes[t] = (int32_t)I[t]; } int32_t frequencies[ALPHABET_SIZE]; if (freq != NULL) { for (t = 0; t < ALPHABET_SIZE; ++t) { frequencies[t] = (int32_t)freq[t]; } } return libsais_unbwt_aux(T, U, (int32_t *)A, (int32_t)n, freq != NULL ? 
frequencies : NULL, (int32_t)r, indexes); } return libsais64_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, 1); } #if defined(LIBSAIS_OPENMP) int64_t libsais64_unbwt_omp(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t i, int64_t threads) { return libsais64_unbwt_aux_omp(T, U, A, n, freq, n, &i, threads); } int64_t libsais64_unbwt_aux_omp(const uint8_t * T, uint8_t * U, int64_t * A, int64_t n, const int64_t * freq, int64_t r, const int64_t * I, int64_t threads) { if ((T == NULL) || (U == NULL) || (A == NULL) || (n < 0) || ((r != n) && ((r < 2) || ((r & (r - 1)) != 0))) || (I == NULL) || (threads < 0)) { return -1; } else if (n <= 1) { if (I[0] != n) { return -1; } if (n == 1) { U[0] = T[0]; } return 0; } fast_sint_t t; for (t = 0; t <= (n - 1) / r; ++t) { if (I[t] <= 0 || I[t] > n) { return -1; } } if (n <= INT32_MAX && r <= INT32_MAX && (n - 1) / r < 1024) { int32_t indexes[1024]; for (t = 0; t <= (n - 1) / r; ++t) { indexes[t] = (int32_t)I[t]; } int32_t frequencies[ALPHABET_SIZE]; if (freq != NULL) { for (t = 0; t < ALPHABET_SIZE; ++t) { frequencies[t] = (int32_t)freq[t]; } } return libsais_unbwt_aux_omp(T, U, (int32_t *)A, (int32_t)n, freq != NULL ? frequencies : NULL,(int32_t)r, indexes, (int32_t)threads); } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? threads : 1; return libsais64_unbwt_main(T, U, (sa_uint_t *)A, n, freq, r, (const sa_uint_t *)I, threads); } #endif static void libsais64_compute_phi(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP, sa_sint_t n, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; sa_sint_t k = omp_block_start > 0 ? 
SA[omp_block_start - 1] : n; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) { libsais64_prefetchr(&SA[i + 2 * prefetch_distance]); libsais64_prefetchw(&PLCP[SA[i + prefetch_distance + 0]]); libsais64_prefetchw(&PLCP[SA[i + prefetch_distance + 1]]); PLCP[SA[i + 0]] = k; k = SA[i + 0]; PLCP[SA[i + 1]] = k; k = SA[i + 1]; libsais64_prefetchw(&PLCP[SA[i + prefetch_distance + 2]]); libsais64_prefetchw(&PLCP[SA[i + prefetch_distance + 3]]); PLCP[SA[i + 2]] = k; k = SA[i + 2]; PLCP[SA[i + 3]] = k; k = SA[i + 3]; } for (j += prefetch_distance + 3; i < j; i += 1) { PLCP[SA[i]] = k; k = SA[i]; } } static void libsais64_compute_phi_omp(const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT PLCP, sa_sint_t n, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; libsais64_compute_phi(SA, PLCP, n, omp_block_start, omp_block_size); } } static void libsais64_compute_plcp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, fast_sint_t n, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j, l = 0; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance; i < j; i += 1) { libsais64_prefetchw(&PLCP[i + 2 * prefetch_distance]); libsais64_prefetchr(&T[PLCP[i + prefetch_distance] + l]); fast_sint_t k = PLCP[i], m = n - (i > k ? 
i : k); while (l < m && T[i + l] == T[k + l]) { l++; } PLCP[i] = (sa_sint_t)l; l -= (l != 0); } for (j += prefetch_distance; i < j; i += 1) { fast_sint_t k = PLCP[i], m = n - (i > k ? i : k); while (l < m && T[i + l] == T[k + l]) { l++; } PLCP[i] = (sa_sint_t)l; l -= (l != 0); } } static void libsais64_compute_plcp_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, sa_sint_t n, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; libsais64_compute_plcp(T, PLCP, n, omp_block_start, omp_block_size); } } static void libsais64_compute_plcp_gsa(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j, l = 0; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance; i < j; i += 1) { libsais64_prefetchw(&PLCP[i + 2 * prefetch_distance]); libsais64_prefetchr(&T[PLCP[i + prefetch_distance] + l]); fast_sint_t k = PLCP[i]; while (T[i + l] > 0 && T[i + l] == T[k + l]) { l++; } PLCP[i] = (sa_sint_t)l; l -= (l != 0); } for (j += prefetch_distance; i < j; i += 1) { fast_sint_t k = PLCP[i]; while (T[i + l] > 0 && T[i + l] == T[k + l]) { l++; } PLCP[i] = (sa_sint_t)l; l -= (l != 0); } } static void libsais64_compute_plcp_gsa_omp(const uint8_t * RESTRICT T, sa_sint_t * RESTRICT PLCP, sa_sint_t n, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && n 
>= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; libsais64_compute_plcp_gsa(T, PLCP, omp_block_start, omp_block_size); } } static void libsais64_compute_lcp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT LCP, fast_sint_t omp_block_start, fast_sint_t omp_block_size) { const fast_sint_t prefetch_distance = 64; fast_sint_t i, j; for (i = omp_block_start, j = omp_block_start + omp_block_size - prefetch_distance - 3; i < j; i += 4) { libsais64_prefetchr(&SA[i + 2 * prefetch_distance]); libsais64_prefetchw(&LCP[i + prefetch_distance]); libsais64_prefetchr(&PLCP[SA[i + prefetch_distance + 0]]); libsais64_prefetchr(&PLCP[SA[i + prefetch_distance + 1]]); LCP[i + 0] = PLCP[SA[i + 0]]; LCP[i + 1] = PLCP[SA[i + 1]]; libsais64_prefetchr(&PLCP[SA[i + prefetch_distance + 2]]); libsais64_prefetchr(&PLCP[SA[i + prefetch_distance + 3]]); LCP[i + 2] = PLCP[SA[i + 2]]; LCP[i + 3] = PLCP[SA[i + 3]]; } for (j += prefetch_distance + 3; i < j; i += 1) { LCP[i] = PLCP[SA[i]]; } } static void libsais64_compute_lcp_omp(const sa_sint_t * RESTRICT PLCP, const sa_sint_t * RESTRICT SA, sa_sint_t * RESTRICT LCP, sa_sint_t n, sa_sint_t threads) { #if defined(LIBSAIS_OPENMP) #pragma omp parallel num_threads(threads) if(threads > 1 && n >= 65536) #endif { #if defined(LIBSAIS_OPENMP) fast_sint_t omp_thread_num = omp_get_thread_num(); fast_sint_t omp_num_threads = omp_get_num_threads(); #else UNUSED(threads); fast_sint_t omp_thread_num = 0; fast_sint_t omp_num_threads = 1; #endif fast_sint_t omp_block_stride = (n / omp_num_threads) & (-16); 
fast_sint_t omp_block_start = omp_thread_num * omp_block_stride; fast_sint_t omp_block_size = omp_thread_num < omp_num_threads - 1 ? omp_block_stride : n - omp_block_start; libsais64_compute_lcp(PLCP, SA, LCP, omp_block_start, omp_block_size); } } int64_t libsais64_plcp(const uint8_t * T, const int64_t * SA, int64_t * PLCP, int64_t n) { if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0)) { return -1; } else if (n <= 1) { if (n == 1) { PLCP[0] = 0; } return 0; } libsais64_compute_phi_omp(SA, PLCP, n, 1); libsais64_compute_plcp_omp(T, PLCP, n, 1); return 0; } int64_t libsais64_plcp_gsa(const uint8_t * T, const int64_t * SA, int64_t * PLCP, int64_t n) { if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0) || (n > 0 && T[n - 1] != 0)) { return -1; } else if (n <= 1) { if (n == 1) { PLCP[0] = 0; } return 0; } libsais64_compute_phi_omp(SA, PLCP, n, 1); libsais64_compute_plcp_gsa_omp(T, PLCP, n, 1); return 0; } int64_t libsais64_lcp(const int64_t * PLCP, const int64_t * SA, int64_t * LCP, int64_t n) { if ((PLCP == NULL) || (SA == NULL) || (LCP == NULL) || (n < 0)) { return -1; } else if (n <= 1) { if (n == 1) { LCP[0] = PLCP[SA[0]]; } return 0; } libsais64_compute_lcp_omp(PLCP, SA, LCP, n, 1); return 0; } #if defined(LIBSAIS_OPENMP) int64_t libsais64_plcp_omp(const uint8_t * T, const int64_t * SA, int64_t * PLCP, int64_t n, int64_t threads) { if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0) || (threads < 0)) { return -1; } else if (n <= 1) { if (n == 1) { PLCP[0] = 0; } return 0; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? 
threads : 1; libsais64_compute_phi_omp(SA, PLCP, n, threads); libsais64_compute_plcp_omp(T, PLCP, n, threads); return 0; } int64_t libsais64_plcp_gsa_omp(const uint8_t * T, const int64_t * SA, int64_t * PLCP, int64_t n, int64_t threads) { if ((T == NULL) || (SA == NULL) || (PLCP == NULL) || (n < 0) || (n > 0 && T[n - 1] != 0) || (threads < 0)) { return -1; } else if (n <= 1) { if (n == 1) { PLCP[0] = 0; } return 0; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? threads : 1; libsais64_compute_phi_omp(SA, PLCP, n, threads); libsais64_compute_plcp_gsa_omp(T, PLCP, n, threads); return 0; } int64_t libsais64_lcp_omp(const int64_t * PLCP, const int64_t * SA, int64_t * LCP, int64_t n, int64_t threads) { if ((PLCP == NULL) || (SA == NULL) || (LCP == NULL) || (n < 0) || (threads < 0)) { return -1; } else if (n <= 1) { if (n == 1) { LCP[0] = PLCP[SA[0]]; } return 0; } threads = threads > 0 ? threads : (omp_get_max_threads() / omp_get_num_threads()); threads = threads > 0 ? threads : 1; libsais64_compute_lcp_omp(PLCP, SA, LCP, n, threads); return 0; } #endif